Iterator.h 8.29 KB
//===--- Iterator.h - Query Symbol Retrieval --------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// Symbol index queries consist of specific requirements for the requested
/// symbol, such as high fuzzy matching score, scope, type etc. The lists of all
/// symbols matching some criteria (e.g. belonging to "clang::clangd::" scope)
/// are expressed in a form of Search Tokens which are stored in the inverted
/// index. Inverted index maps these tokens to the posting lists - sorted (by
/// symbol quality) sequences of symbol IDs matching the token, e.g. scope token
/// "clangd::clangd::" is mapped to the list of IDs of all symbols which are
/// declared in this namespace. Search queries are build from a set of
/// requirements which can be combined with each other forming the query trees.
/// The leafs of such trees are posting lists, and the nodes are operations on
/// these posting lists, e.g. intersection or union. Efficient processing of
/// these multi-level queries is handled by Iterators. Iterators advance through
/// all leaf posting lists producing the result of search query, which preserves
/// the sorted order of IDs. Having the resulting IDs sorted is important,
/// because it allows receiving a certain number of the most valuable items
/// (e.g. symbols with highest quality which was the sorting key in the first
/// place) without processing all items with requested properties (this might
/// not be computationally effective if search request is not very restrictive).
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <memory>
#include <vector>

namespace clang {
namespace clangd {
namespace dex {

/// Symbol position in the list of all index symbols sorted by a pre-computed
/// symbol quality.
using DocID = uint32_t;

/// Iterator is the interface for Query Tree node. The simplest type of Iterator
/// is DocumentIterator which is simply a wrapper around PostingList iterator
/// and serves as the Query Tree leaf. More sophisticated examples of iterators
/// can manage intersection, union of the elements produced by other iterators
/// (their children) to form a multi-level Query Tree. The interface is designed
/// to be extensible in order to support multiple types of iterators.
class Iterator {
public:
  /// Returns true if all valid DocIDs were processed and hence the iterator is
  /// exhausted.
  virtual bool reachedEnd() const = 0;
  /// Moves to next valid DocID. If it doesn't exist, the iterator is exhausted
  /// and proceeds to the END.
  ///
  /// Note: reachedEnd() must be false.
  virtual void advance() = 0;
  /// Moves to the first valid DocID which is equal or higher than given ID. If
  /// it doesn't exist, the iterator is exhausted and proceeds to the END.
  ///
  /// Note: reachedEnd() must be false.
  virtual void advanceTo(DocID ID) = 0;
  /// Returns the current element this iterator points to.
  ///
  /// Note: reachedEnd() must be false.
  virtual DocID peek() const = 0;
  /// Informs the iterator that the current document was consumed, and returns
  /// its boost.
  ///
  /// Note: If this iterator has any child iterators that contain the document,
  /// consume() should be called on those and their boosts incorporated.
  /// consume() must *not* be called on children that don't contain the current
  /// doc.
  virtual float consume() = 0;
  /// Returns an estimate of advance() calls before the iterator is exhausted.
  virtual size_t estimateSize() const = 0;

  virtual ~Iterator() {}

  /// Prints a convenient human-readable iterator representation by recursively
  /// dumping iterators in the following format:
  ///
  /// (Type Child1 Child2 ...)
  ///
  /// Where Type is the iterator type representation: "&" for And, "|" for Or,
  /// ChildN is N-th iterator child. Raw iterators over PostingList are
  /// represented as "[... CurID ...]" where CurID is the current PostingList
  /// entry being inspected.
  friend llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
                                       const Iterator &Iterator) {
    return Iterator.dump(OS);
  }

  /// Inspect iterator type, used internally for optimizing query trees.
  enum class Kind { And, Or, True, False, Other };
  Kind kind() const { return MyKind; }

protected:
  Iterator(Kind MyKind = Kind::Other) : MyKind(MyKind) {}

private:
  virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0;
  Kind MyKind;
};

/// Advances the iterator until it is exhausted. Returns pairs of document IDs
/// with the corresponding boosting score.
///
/// Boosting can be seen as a compromise between retrieving too many items and
/// calculating finals score for each of them (which might be very expensive)
/// and not retrieving enough items so that items with very high final score
/// would not be processed. Boosting score is a computationally efficient way
/// to acquire preliminary scores of requested items.
std::vector<std::pair<DocID, float>> consume(Iterator &It);

namespace detail {
// Variadic template machinery.
inline void populateChildren(std::vector<std::unique_ptr<Iterator>> &) {}
template <typename... TailT>
void populateChildren(std::vector<std::unique_ptr<Iterator>> &Children,
                      std::unique_ptr<Iterator> Head, TailT... Tail) {
  Children.push_back(move(Head));
  populateChildren(Children, move(Tail)...);
}
} // namespace detail

// A corpus is a set of documents, and a factory for iterators over them.
class Corpus {
  DocID Size;

public:
  explicit Corpus(DocID Size) : Size(Size) {}

  /// Returns AND Iterator which performs the intersection of the PostingLists
  /// of its children.
  ///
  /// consume(): AND Iterator returns the product of Childrens' boosting
  /// scores.
  std::unique_ptr<Iterator>
  intersect(std::vector<std::unique_ptr<Iterator>> Children) const;

  /// Returns OR Iterator which performs the union of the PostingLists of its
  /// children.
  ///
  /// consume(): OR Iterator returns the highest boost value among children
  /// containing the requested item.
  std::unique_ptr<Iterator>
  unionOf(std::vector<std::unique_ptr<Iterator>> Children) const;

  /// Returns TRUE Iterator which iterates over "virtual" PostingList
  /// containing all items in range [0, Size) in an efficient manner.
  std::unique_ptr<Iterator> all() const;

  /// Returns FALSE Iterator which iterates over no documents.
  std::unique_ptr<Iterator> none() const;

  /// Returns BOOST iterator which multiplies the score of each item by given
  /// factor. Boosting can be used as a computationally inexpensive filtering.
  /// Users can return significantly more items using consumeAndBoost() and
  /// then trim Top K using retrieval score.
  std::unique_ptr<Iterator> boost(std::unique_ptr<Iterator> Child,
                                  float Factor) const;

  /// Returns LIMIT iterator, which yields up to N elements of its child
  /// iterator. Elements only count towards the limit if they are part of the
  /// final result set. Therefore the following iterator (AND (2) (LIMIT (1 2)
  /// 1)) yields (2), not ().
  std::unique_ptr<Iterator> limit(std::unique_ptr<Iterator> Child,
                                  size_t Limit) const;

  /// This allows intersect(create(...), create(...)) syntax.
  template <typename... Args>
  std::unique_ptr<Iterator> intersect(Args... args) const {
    std::vector<std::unique_ptr<Iterator>> Children;
    detail::populateChildren(Children, std::forward<Args>(args)...);
    return intersect(move(Children));
  }

  /// This allows unionOf(create(...), create(...)) syntax.
  template <typename... Args>
  std::unique_ptr<Iterator> unionOf(Args... args) const {
    std::vector<std::unique_ptr<Iterator>> Children;
    detail::populateChildren(Children, std::forward<Args>(args)...);
    return unionOf(move(Children));
  }
};

} // namespace dex
} // namespace clangd
} // namespace clang

#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H