FuzzySymbolIndex.cpp
3.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
//===--- FuzzySymbolIndex.cpp - Lookup symbols for autocomplete -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "FuzzySymbolIndex.h"
#include "llvm/Support/Regex.h"
using clang::find_all_symbols::SymbolAndSignals;
using llvm::StringRef;
namespace clang {
namespace include_fixer {
namespace {
class MemSymbolIndex : public FuzzySymbolIndex {
public:
MemSymbolIndex(std::vector<SymbolAndSignals> Symbols) {
for (auto &Symbol : Symbols) {
auto Tokens = tokenize(Symbol.Symbol.getName());
this->Symbols.emplace_back(
StringRef(llvm::join(Tokens.begin(), Tokens.end(), " ")),
std::move(Symbol));
}
}
std::vector<SymbolAndSignals> search(StringRef Query) override {
auto Tokens = tokenize(Query);
llvm::Regex Pattern("^" + queryRegexp(Tokens));
std::vector<SymbolAndSignals> Results;
for (const Entry &E : Symbols)
if (Pattern.match(E.first))
Results.push_back(E.second);
return Results;
}
private:
using Entry = std::pair<llvm::SmallString<32>, SymbolAndSignals>;
std::vector<Entry> Symbols;
};
// Helpers for tokenize state machine.
// Scanner states used by tokenize(). Each one describes the characters that
// have been read since the last token was flushed.
enum TokenizeState {
  // No pending characters.
  EMPTY,
  // Read one uppercase letter, could be WORD or Word.
  ONE_BIG,
  // Reading an uppercase WORD.
  BIG_WORD,
  // Reading a lowercase word.
  SMALL_WORD,
  // Reading a number.
  NUMBER
};
// Character classes distinguished by the tokenizer. MISC is everything that
// is neither a letter nor a digit.
enum CharType { UPPER, LOWER, DIGIT, MISC };

// Returns the CharType of c.
//
// The <cctype> predicates have undefined behaviour when passed a negative
// value other than EOF, which is what a non-ASCII byte becomes on platforms
// where plain char is signed. Converting to unsigned char first keeps the
// call well-defined for every possible input byte.
CharType classify(char c) {
  const unsigned char UC = static_cast<unsigned char>(c);
  if (isupper(UC))
    return UPPER;
  if (islower(UC))
    return LOWER;
  if (isdigit(UC))
    return DIGIT;
  return MISC;
}
} // namespace
// Splits Text into lowercased tokens.
//
// A token is a run of lowercase letters (optionally led by one uppercase
// letter), a run of uppercase letters, or a run of digits. Any other
// character ends the current token and is dropped.
std::vector<std::string> FuzzySymbolIndex::tokenize(StringRef Text) {
  std::vector<std::string> Tokens;
  // Pending describes how the characters in [Begin, Pos) are being treated.
  TokenizeState Pending = EMPTY;
  size_t Begin = 0;
  // Ends the pending token (if any) at End, appends it lowercased, and
  // restarts scanning from End.
  auto Emit = [&](size_t End) {
    if (Pending != EMPTY) {
      Tokens.push_back(Text.substr(Begin, End - Begin).lower());
      Pending = EMPTY;
    }
    Begin = End;
  };

  const size_t Length = Text.size();
  for (size_t Pos = 0; Pos < Length; ++Pos) {
    switch (classify(Text[Pos])) {
    case MISC:
      Emit(Pos);
      break;

    case LOWER:
      // Already inside a lowercase word: just keep extending it.
      if (Pending == SMALL_WORD)
        break;
      if (Pending == BIG_WORD)
        Emit(Pos - 1); // FOOBar: first token is FOO, not FOOB.
      else if (Pending != ONE_BIG)
        Emit(Pos); // EMPTY or NUMBER: a new word starts here.
      // With ONE_BIG pending nothing is emitted: the capital leads this word.
      Pending = SMALL_WORD;
      break;

    case UPPER:
      // Already inside an uppercase word: just keep extending it.
      if (Pending == BIG_WORD)
        break;
      if (Pending == ONE_BIG) {
        // A second consecutive capital: this is an uppercase WORD.
        Pending = BIG_WORD;
        break;
      }
      Emit(Pos);
      Pending = ONE_BIG;
      break;

    case DIGIT:
      if (Pending != NUMBER) {
        Emit(Pos);
        Pending = NUMBER;
      }
      break;
    }
  }
  // Whatever is still pending when the text ends is the last token.
  Emit(Length);
  return Tokens;
}
// Builds the regular expression (as text) used to match query Tokens against
// a space-joined token string.
//
// Consecutive query tokens are separated by "[[:alnum:]]* ", and consecutive
// characters inside a token by "([[:alnum:]]* )?". Token characters are
// copied into the pattern verbatim, without escaping.
std::string
FuzzySymbolIndex::queryRegexp(const std::vector<std::string> &Tokens) {
  std::string Pattern;
  bool FirstToken = true;
  for (const std::string &Token : Tokens) {
    if (!FirstToken)
      Pattern += "[[:alnum:]]* ";
    FirstToken = false;

    bool FirstChar = true;
    for (char Ch : Token) {
      if (!FirstChar)
        Pattern += "([[:alnum:]]* )?";
      FirstChar = false;
      Pattern += Ch;
    }
  }
  return Pattern;
}
// Creates an in-memory index from the file at FilePath.
//
// If the file cannot be read, the error code reported by
// llvm::MemoryBuffer::getFile is returned as an llvm::Error. Otherwise the
// file contents are handed to find_all_symbols::ReadSymbolInfosFromYAML and
// the resulting symbols populate a MemSymbolIndex.
llvm::Expected<std::unique_ptr<FuzzySymbolIndex>>
FuzzySymbolIndex::createFromYAML(StringRef FilePath) {
  auto FileOrErr = llvm::MemoryBuffer::getFile(FilePath);
  if (std::error_code EC = FileOrErr.getError())
    return llvm::errorCodeToError(EC);

  StringRef Contents = (*FileOrErr)->getBuffer();
  return std::make_unique<MemSymbolIndex>(
      find_all_symbols::ReadSymbolInfosFromYAML(Contents));
}
} // namespace include_fixer
} // namespace clang