Showing
1 changed file
with
133 additions
and
7 deletions
... | @@ -14,6 +14,13 @@ | ... | @@ -14,6 +14,13 @@ |
14 | 14 | ||
15 | using namespace std; | 15 | using namespace std; |
16 | 16 | ||
// Bit flags selecting which conversion passes main() performs.
// Every enumerator occupies its own bit so the flags can be OR-ed together
// and tested individually with `mode & MODE::X`.
enum MODE {
    NONE     = 0,
    PAPER    = 1 << 0,  // convert the paper file
    COAUTHOR = 1 << 1,  // convert the co-authorship file
};

// Passes that are switched on: both of them.
const int mode = MODE::COAUTHOR | MODE::PAPER;
23 | + | ||
17 | const char* DBLP_FILENAME = "dblp.json"; | 24 | const char* DBLP_FILENAME = "dblp.json"; |
18 | const char* DBLP_COAUTHOR_FILENAME = "tmp_dblp_coauthorship.json"; | 25 | const char* DBLP_COAUTHOR_FILENAME = "tmp_dblp_coauthorship.json"; |
19 | const string COLUMN_DELIMITER = "||"; | 26 | const string COLUMN_DELIMITER = "||"; |
... | @@ -159,11 +166,118 @@ struct DblpPaperHandler { | ... | @@ -159,11 +166,118 @@ struct DblpPaperHandler { |
159 | } | 166 | } |
160 | }; | 167 | }; |
161 | 168 | ||
169 | +struct CoauthorRecord { | ||
170 | + string author1, author2; | ||
171 | + unsigned int year; | ||
172 | + | ||
173 | + void write(ofstream& fout) { | ||
174 | + fout << author1 << COLUMN_DELIMITER | ||
175 | + << author2 << COLUMN_DELIMITER | ||
176 | + << year << endl; | ||
177 | + } | ||
178 | +}; | ||
179 | + | ||
180 | +struct DblpCoauthorHandler { | ||
181 | + bool whole_array = false; | ||
182 | + bool is_record = false; | ||
183 | + int read_author_cnt = 0; | ||
184 | + uint64_t record_count = 0; | ||
185 | + | ||
186 | + CoauthorRecord coauthor; | ||
187 | + ofstream& ofs; | ||
188 | + | ||
189 | + DblpCoauthorHandler(ofstream& fout) | ||
190 | + : ofs(fout) { | ||
191 | + } | ||
162 | 192 | ||
163 | -int main() { | 193 | + // |
194 | + bool Null() { | ||
195 | + return true; | ||
196 | + } | ||
197 | + bool Bool(bool b) { | ||
198 | + //cout << "Bool(" << boolalpha << b << ")" << endl; | ||
199 | + return true; | ||
200 | + } | ||
201 | + bool Int(int i) { | ||
202 | + //cout << "Int(" << i << ")" << endl; | ||
203 | + return true; | ||
204 | + } | ||
205 | + bool Uint(unsigned u) { | ||
206 | + coauthor.year = u; | ||
207 | + return true; | ||
208 | + } | ||
209 | + bool Int64(int64_t i) { | ||
210 | + //cout << "Int64(" << i << ")" << endl; | ||
211 | + return true; | ||
212 | + } | ||
213 | + bool Uint64(uint64_t u) { | ||
214 | + //cout << "Uint64(" << u << ")" << endl; | ||
215 | + return true; | ||
216 | + } | ||
217 | + bool Double(double d) { | ||
218 | + //cout << "Double(" << d << ")" << endl; | ||
219 | + return true; | ||
220 | + } | ||
221 | + bool RawNumber(const char* str, rapidjson::SizeType length, bool copy) { | ||
222 | + //cout << "Number(" << str << ", " << length << ", " << boolalpha << copy << ")" << endl; | ||
223 | + return true; | ||
224 | + } | ||
225 | + bool String(const char* str, rapidjson::SizeType length, bool copy) { | ||
226 | + if (is_record) { | ||
227 | + if (read_author_cnt == 0) { | ||
228 | + coauthor.author1 = string(str); | ||
229 | + } else { | ||
230 | + coauthor.author2 = string(str); | ||
231 | + } | ||
232 | + | ||
233 | + ++read_author_cnt; | ||
234 | + } | ||
235 | + return true; | ||
236 | + } | ||
237 | + bool StartObject() { | ||
238 | + //cout << "StartObject()" << endl; | ||
239 | + return true; | ||
240 | + } | ||
241 | + bool Key(const char* str, rapidjson::SizeType length, bool copy) { | ||
242 | + //cout << "Key(" << str << ", " << length << ", " << boolalpha << copy << ")" << endl; | ||
243 | + return true; | ||
244 | + } | ||
245 | + bool EndObject(rapidjson::SizeType memberCount) { | ||
246 | + //cout << "EndObject(" << memberCount << ")" << endl; | ||
247 | + return true; | ||
248 | + } | ||
249 | + bool StartArray() { | ||
250 | + if (!whole_array) { | ||
251 | + whole_array = true; | ||
252 | + } else if (!is_record) { | ||
253 | + is_record = true; | ||
254 | + read_author_cnt = 0; | ||
255 | + } | ||
256 | + return true; | ||
257 | + } | ||
258 | + bool EndArray(rapidjson::SizeType elementCount) { | ||
259 | + if (is_record) { | ||
260 | + coauthor.write(ofs); | ||
261 | + is_record = false; | ||
262 | + ++record_count; | ||
263 | + if (record_count % 100000 == 0) { | ||
264 | + printf("* [%" PRIu64 "] \n", record_count); | ||
265 | + } | ||
266 | + } else { | ||
267 | + whole_array = false; | ||
268 | + printf("* total paper record: [%" PRIu64 "]\n", record_count); | ||
269 | + } | ||
270 | + return true; | ||
271 | + } | ||
272 | +}; | ||
273 | + | ||
274 | + | ||
275 | +int main(int argc, char* argv[]) { | ||
276 | + rapidjson::Reader reader; | ||
164 | try { | 277 | try { |
165 | - ifstream dblp_paper_in, dblp_coauthor_in; | 278 | + if (mode & MODE::PAPER) { |
166 | - ofstream dblp_paper_out, dblp_coauthor_out; | 279 | + ifstream dblp_paper_in; |
280 | + ofstream dblp_paper_out; | ||
167 | dblp_paper_in.open(DBLP_FILENAME); | 281 | dblp_paper_in.open(DBLP_FILENAME); |
168 | dblp_paper_out.open((string(DBLP_FILENAME)+string(".out")).c_str()); | 282 | dblp_paper_out.open((string(DBLP_FILENAME)+string(".out")).c_str()); |
169 | if (!dblp_paper_in || !dblp_paper_out) { | 283 | if (!dblp_paper_in || !dblp_paper_out) { |
... | @@ -171,18 +285,30 @@ int main() { | ... | @@ -171,18 +285,30 @@ int main() { |
171 | } | 285 | } |
172 | 286 | ||
173 | DblpPaperHandler paper_handler(dblp_paper_out); | 287 | DblpPaperHandler paper_handler(dblp_paper_out); |
174 | - | ||
175 | rapidjson::IStreamWrapper dblp_paper_isw(dblp_paper_in); | 288 | rapidjson::IStreamWrapper dblp_paper_isw(dblp_paper_in); |
176 | - rapidjson::Reader reader; | ||
177 | - | ||
178 | reader.Parse(dblp_paper_isw, paper_handler); | 289 | reader.Parse(dblp_paper_isw, paper_handler); |
179 | 290 | ||
180 | - //release | ||
181 | if (dblp_paper_in) dblp_paper_in.close(); | 291 | if (dblp_paper_in) dblp_paper_in.close(); |
182 | if (dblp_paper_out) dblp_paper_out.close(); | 292 | if (dblp_paper_out) dblp_paper_out.close(); |
293 | + } | ||
294 | + | ||
295 | + if (mode & MODE::COAUTHOR) { | ||
296 | + ifstream dblp_coauthor_in; | ||
297 | + ofstream dblp_coauthor_out; | ||
298 | + dblp_coauthor_in.open(DBLP_COAUTHOR_FILENAME); | ||
299 | + dblp_coauthor_out.open((string(DBLP_COAUTHOR_FILENAME)+string(".out")).c_str()); | ||
300 | + if (!dblp_coauthor_in || !dblp_coauthor_out) { | ||
301 | + throw exception("dblp coauthor file"); | ||
302 | + } | ||
303 | + | ||
304 | + DblpCoauthorHandler coauthor_handler(dblp_coauthor_out); | ||
305 | + rapidjson::IStreamWrapper dblp_coauthor_isw(dblp_coauthor_in); | ||
306 | + reader.Parse(dblp_coauthor_isw, coauthor_handler); | ||
307 | + | ||
183 | if (dblp_coauthor_in) dblp_coauthor_in.close(); | 308 | if (dblp_coauthor_in) dblp_coauthor_in.close(); |
184 | if (dblp_coauthor_out) dblp_coauthor_out.close(); | 309 | if (dblp_coauthor_out) dblp_coauthor_out.close(); |
185 | } | 310 | } |
311 | + } | ||
186 | catch (const exception& e) { | 312 | catch (const exception& e) { |
187 | cerr << "Error: " << e.what() << endl; | 313 | cerr << "Error: " << e.what() << endl; |
188 | return -1; | 314 | return -1; | ... | ... |
-
Please register or login to post a comment