CompletionModelCodegen.py
10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
"""Code generator for Code Completion Model Inference.
Tool runs on the Decision Forest model defined in {model} directory.
It generates two files: {output_dir}/{filename}.h and {output_dir}/{filename}.cpp
The generated files defines the Example class named {cpp_class} having all the features as class members.
The generated runtime provides an `Evaluate` function which can be used to score a code completion candidate.
"""
import argparse
import json
import struct
class CppClass:
"""Holds class name and names of the enclosing namespaces."""
def __init__(self, cpp_class):
ns_and_class = cpp_class.split("::")
self.ns = [ns for ns in ns_and_class[0:-1] if len(ns) > 0]
self.name = ns_and_class[-1]
if len(self.name) == 0:
raise ValueError("Empty class name.")
def ns_begin(self):
"""Returns snippet for opening namespace declarations."""
open_ns = ["namespace %s {" % ns for ns in self.ns]
return "\n".join(open_ns)
def ns_end(self):
"""Returns snippet for closing namespace declarations."""
close_ns = [
"} // namespace %s" % ns for ns in reversed(self.ns)]
return "\n".join(close_ns)
def header_guard(filename):
'''Returns the header guard for the generated header.'''
return "GENERATED_DECISION_FOREST_MODEL_%s_H" % filename.upper()
def boost_node(n, label, next_label):
"""Returns code snippet for a leaf/boost node."""
return "%s: return %s;" % (label, n['score'])
def if_greater_node(n, label, next_label):
"""Returns code snippet for a if_greater node.
Jumps to true_label if the Example feature (NUMBER) is greater than the threshold.
Comparing integers is much faster than comparing floats. Assuming floating points
are represented as IEEE 754, it order-encodes the floats to integers before comparing them.
Control falls through if condition is evaluated to false."""
threshold = n["threshold"]
return "%s: if (E.get%s() >= %s /*%s*/) goto %s;" % (
label, n['feature'], order_encode(threshold), threshold, next_label)
def if_member_node(n, label, next_label):
"""Returns code snippet for a if_member node.
Jumps to true_label if the Example feature (ENUM) is present in the set of enum values
described in the node.
Control falls through if condition is evaluated to false."""
members = '|'.join([
"BIT(%s_type::%s)" % (n['feature'], member)
for member in n["set"]
])
return "%s: if (E.get%s() & (%s)) goto %s;" % (
label, n['feature'], members, next_label)
def node(n, label, next_label):
"""Returns code snippet for the node."""
return {
'boost': boost_node,
'if_greater': if_greater_node,
'if_member': if_member_node,
}[n['operation']](n, label, next_label)
def tree(t, tree_num, node_num):
"""Returns code for inferencing a Decision Tree.
Also returns the size of the decision tree.
A tree starts with its label `t{tree#}`.
A node of the tree starts with label `t{tree#}_n{node#}`.
The tree contains two types of node: Conditional node and Leaf node.
- Conditional node evaluates a condition. If true, it jumps to the true node/child.
Code is generated using pre-order traversal of the tree considering
false node as the first child. Therefore the false node is always the
immediately next label.
- Leaf node adds the value to the score and jumps to the next tree.
"""
label = "t%d_n%d" % (tree_num, node_num)
code = []
if t["operation"] == "boost":
code.append(node(t, label=label, next_label="t%d" % (tree_num + 1)))
return code, 1
false_code, false_size = tree(
t['else'], tree_num=tree_num, node_num=node_num+1)
true_node_num = node_num+false_size+1
true_label = "t%d_n%d" % (tree_num, true_node_num)
true_code, true_size = tree(
t['then'], tree_num=tree_num, node_num=true_node_num)
code.append(node(t, label=label, next_label=true_label))
return code+false_code+true_code, 1+false_size+true_size
def gen_header_code(features_json, cpp_class, filename):
"""Returns code for header declaring the inference runtime.
Declares the Example class named {cpp_class} inside relevant namespaces.
The Example class contains all the features as class members. This
class can be used to represent a code completion candidate.
Provides `float Evaluate()` function which can be used to score the Example.
"""
setters = []
getters = []
for f in features_json:
feature = f["name"]
if f["kind"] == "NUMBER":
# Floats are order-encoded to integers for faster comparison.
setters.append(
"void set%s(float V) { %s = OrderEncode(V); }" % (
feature, feature))
elif f["kind"] == "ENUM":
setters.append(
"void set%s(unsigned V) { %s = 1 << V; }" % (feature, feature))
else:
raise ValueError("Unhandled feature type.", f["kind"])
# Class members represent all the features of the Example.
class_members = [
"uint32_t %s = 0;" % f['name']
for f in features_json
]
getters = [
"LLVM_ATTRIBUTE_ALWAYS_INLINE uint32_t get%s() const { return %s; }"
% (f['name'], f['name'])
for f in features_json
]
nline = "\n "
guard = header_guard(filename)
return """#ifndef %s
#define %s
#include <cstdint>
#include "llvm/Support/Compiler.h"
%s
class %s {
public:
// Setters.
%s
// Getters.
%s
private:
%s
// Produces an integer that sorts in the same order as F.
// That is: a < b <==> orderEncode(a) < orderEncode(b).
static uint32_t OrderEncode(float F);
};
float Evaluate(const %s&);
%s
#endif // %s
""" % (guard, guard, cpp_class.ns_begin(), cpp_class.name,
nline.join(setters),
nline.join(getters),
nline.join(class_members),
cpp_class.name, cpp_class.ns_end(), guard)
def order_encode(v):
i = struct.unpack('<I', struct.pack('<f', v))[0]
TopBit = 1 << 31
# IEEE 754 floats compare like sign-magnitude integers.
if (i & TopBit): # Negative float
return (1 << 32) - i # low half of integers, order reversed.
return TopBit + i # top half of integers
def evaluate_func(forest_json, cpp_class):
"""Generates evaluation functions for each tree and combines them in
`float Evaluate(const {Example}&)` function. This function can be
used to score an Example."""
code = ""
# Generate evaluation function of each tree.
code += "namespace {\n"
tree_num = 0
for tree_json in forest_json:
code += "LLVM_ATTRIBUTE_NOINLINE float EvaluateTree%d(const %s& E) {\n" % (tree_num, cpp_class.name)
code += " " + \
"\n ".join(
tree(tree_json, tree_num=tree_num, node_num=0)[0]) + "\n"
code += "}\n\n"
tree_num += 1
code += "} // namespace\n\n"
# Combine the scores of all trees in the final function.
# MSAN will timeout if these functions are inlined.
code += "float Evaluate(const %s& E) {\n" % cpp_class.name
code += " float Score = 0;\n"
for tree_num in range(len(forest_json)):
code += " Score += EvaluateTree%d(E);\n" % tree_num
code += " return Score;\n"
code += "}\n"
return code
def gen_cpp_code(forest_json, features_json, filename, cpp_class):
"""Generates code for the .cpp file."""
# Headers
# Required by OrderEncode(float F).
angled_include = [
'#include <%s>' % h
for h in ["cstring", "limits"]
]
# Include generated header.
qouted_headers = {filename + '.h', 'llvm/ADT/bit.h'}
# Headers required by ENUM features used by the model.
qouted_headers |= {f["header"]
for f in features_json if f["kind"] == "ENUM"}
quoted_include = ['#include "%s"' % h for h in sorted(qouted_headers)]
# using-decl for ENUM features.
using_decls = "\n".join("using %s_type = %s;" % (
feature['name'], feature['type'])
for feature in features_json
if feature["kind"] == "ENUM")
nl = "\n"
return """%s
%s
#define BIT(X) (1 << X)
%s
%s
uint32_t %s::OrderEncode(float F) {
static_assert(std::numeric_limits<float>::is_iec559, "");
constexpr uint32_t TopBit = ~(~uint32_t{0} >> 1);
// Get the bits of the float. Endianness is the same as for integers.
uint32_t U = llvm::bit_cast<uint32_t>(F);
std::memcpy(&U, &F, sizeof(U));
// IEEE 754 floats compare like sign-magnitude integers.
if (U & TopBit) // Negative float.
return 0 - U; // Map onto the low half of integers, order reversed.
return U + TopBit; // Positive floats map onto the high half of integers.
}
%s
%s
""" % (nl.join(angled_include), nl.join(quoted_include), cpp_class.ns_begin(),
using_decls, cpp_class.name, evaluate_func(forest_json, cpp_class),
cpp_class.ns_end())
def main():
parser = argparse.ArgumentParser('DecisionForestCodegen')
parser.add_argument('--filename', help='output file name.')
parser.add_argument('--output_dir', help='output directory.')
parser.add_argument('--model', help='path to model directory.')
parser.add_argument(
'--cpp_class',
help='The name of the class (which may be a namespace-qualified) created in generated header.'
)
ns = parser.parse_args()
output_dir = ns.output_dir
filename = ns.filename
header_file = "%s/%s.h" % (output_dir, filename)
cpp_file = "%s/%s.cpp" % (output_dir, filename)
cpp_class = CppClass(cpp_class=ns.cpp_class)
model_file = "%s/forest.json" % ns.model
features_file = "%s/features.json" % ns.model
with open(features_file) as f:
features_json = json.load(f)
with open(model_file) as m:
forest_json = json.load(m)
with open(cpp_file, 'w+t') as output_cc:
output_cc.write(
gen_cpp_code(forest_json=forest_json,
features_json=features_json,
filename=filename,
cpp_class=cpp_class))
with open(header_file, 'w+t') as output_h:
output_h.write(gen_header_code(
features_json=features_json,
cpp_class=cpp_class,
filename=filename))
if __name__ == '__main__':
main()