Showing
1 changed file
with
143 additions
and
17 deletions
... | @@ -14,6 +14,13 @@ | ... | @@ -14,6 +14,13 @@ |
14 | 14 | ||
15 | using namespace std; | 15 | using namespace std; |
16 | 16 | ||
// Bit flags selecting which conversion passes main() runs.
enum MODE {
    NONE     = 0,
    PAPER    = 1 << 0,   // convert the paper file
    COAUTHOR = 1 << 1    // convert the co-authorship file
};
// Passes enabled in this build: both flags set.
const int mode = PAPER | COAUTHOR;

// Input file for the paper pass; its output is written to this name + ".out".
const char* DBLP_FILENAME = "dblp.json";
// Input file for the co-author pass; its output is written to this name + ".out".
const char* DBLP_COAUTHOR_FILENAME = "tmp_dblp_coauthorship.json";
// Separator placed between columns of each output line.
const std::string COLUMN_DELIMITER = "||";
... | @@ -159,29 +166,148 @@ struct DblpPaperHandler { | ... | @@ -159,29 +166,148 @@ struct DblpPaperHandler { |
159 | } | 166 | } |
160 | }; | 167 | }; |
161 | 168 | ||
169 | +struct CoauthorRecord { | ||
170 | + string author1, author2; | ||
171 | + unsigned int year; | ||
162 | 172 | ||
163 | -int main() { | 173 | + void write(ofstream& fout) { |
164 | - try { | 174 | + fout << author1 << COLUMN_DELIMITER |
165 | - ifstream dblp_paper_in, dblp_coauthor_in; | 175 | + << author2 << COLUMN_DELIMITER |
166 | - ofstream dblp_paper_out, dblp_coauthor_out; | 176 | + << year << endl; |
167 | - dblp_paper_in.open(DBLP_FILENAME); | 177 | + } |
168 | - dblp_paper_out.open((string(DBLP_FILENAME)+string(".out")).c_str()); | 178 | +}; |
169 | - if (!dblp_paper_in || !dblp_paper_out) { | 179 | + |
170 | - throw exception("dblp paper file"); | 180 | +struct DblpCoauthorHandler { |
181 | + bool whole_array = false; | ||
182 | + bool is_record = false; | ||
183 | + int read_author_cnt = 0; | ||
184 | + uint64_t record_count = 0; | ||
185 | + | ||
186 | + CoauthorRecord coauthor; | ||
187 | + ofstream& ofs; | ||
188 | + | ||
189 | + DblpCoauthorHandler(ofstream& fout) | ||
190 | + : ofs(fout) { | ||
191 | + } | ||
192 | + | ||
193 | + // | ||
194 | + bool Null() { | ||
195 | + return true; | ||
196 | + } | ||
197 | + bool Bool(bool b) { | ||
198 | + //cout << "Bool(" << boolalpha << b << ")" << endl; | ||
199 | + return true; | ||
200 | + } | ||
201 | + bool Int(int i) { | ||
202 | + //cout << "Int(" << i << ")" << endl; | ||
203 | + return true; | ||
204 | + } | ||
205 | + bool Uint(unsigned u) { | ||
206 | + coauthor.year = u; | ||
207 | + return true; | ||
208 | + } | ||
209 | + bool Int64(int64_t i) { | ||
210 | + //cout << "Int64(" << i << ")" << endl; | ||
211 | + return true; | ||
212 | + } | ||
213 | + bool Uint64(uint64_t u) { | ||
214 | + //cout << "Uint64(" << u << ")" << endl; | ||
215 | + return true; | ||
216 | + } | ||
217 | + bool Double(double d) { | ||
218 | + //cout << "Double(" << d << ")" << endl; | ||
219 | + return true; | ||
220 | + } | ||
221 | + bool RawNumber(const char* str, rapidjson::SizeType length, bool copy) { | ||
222 | + //cout << "Number(" << str << ", " << length << ", " << boolalpha << copy << ")" << endl; | ||
223 | + return true; | ||
224 | + } | ||
225 | + bool String(const char* str, rapidjson::SizeType length, bool copy) { | ||
226 | + if (is_record) { | ||
227 | + if (read_author_cnt == 0) { | ||
228 | + coauthor.author1 = string(str); | ||
229 | + } else { | ||
230 | + coauthor.author2 = string(str); | ||
231 | + } | ||
232 | + | ||
233 | + ++read_author_cnt; | ||
234 | + } | ||
235 | + return true; | ||
236 | + } | ||
237 | + bool StartObject() { | ||
238 | + //cout << "StartObject()" << endl; | ||
239 | + return true; | ||
240 | + } | ||
241 | + bool Key(const char* str, rapidjson::SizeType length, bool copy) { | ||
242 | + //cout << "Key(" << str << ", " << length << ", " << boolalpha << copy << ")" << endl; | ||
243 | + return true; | ||
244 | + } | ||
245 | + bool EndObject(rapidjson::SizeType memberCount) { | ||
246 | + //cout << "EndObject(" << memberCount << ")" << endl; | ||
247 | + return true; | ||
248 | + } | ||
249 | + bool StartArray() { | ||
250 | + if (!whole_array) { | ||
251 | + whole_array = true; | ||
252 | + } else if (!is_record) { | ||
253 | + is_record = true; | ||
254 | + read_author_cnt = 0; | ||
255 | + } | ||
256 | + return true; | ||
257 | + } | ||
258 | + bool EndArray(rapidjson::SizeType elementCount) { | ||
259 | + if (is_record) { | ||
260 | + coauthor.write(ofs); | ||
261 | + is_record = false; | ||
262 | + ++record_count; | ||
263 | + if (record_count % 100000 == 0) { | ||
264 | + printf("* [%" PRIu64 "] \n", record_count); | ||
265 | + } | ||
266 | + } else { | ||
267 | + whole_array = false; | ||
268 | + printf("* total paper record: [%" PRIu64 "]\n", record_count); | ||
171 | } | 269 | } |
270 | + return true; | ||
271 | + } | ||
272 | +}; | ||
273 | + | ||
274 | + | ||
275 | +int main(int argc, char* argv[]) { | ||
276 | + rapidjson::Reader reader; | ||
277 | + try { | ||
278 | + if (mode & MODE::PAPER) { | ||
279 | + ifstream dblp_paper_in; | ||
280 | + ofstream dblp_paper_out; | ||
281 | + dblp_paper_in.open(DBLP_FILENAME); | ||
282 | + dblp_paper_out.open((string(DBLP_FILENAME)+string(".out")).c_str()); | ||
283 | + if (!dblp_paper_in || !dblp_paper_out) { | ||
284 | + throw exception("dblp paper file"); | ||
285 | + } | ||
172 | 286 | ||
173 | - DblpPaperHandler paper_handler(dblp_paper_out); | 287 | + DblpPaperHandler paper_handler(dblp_paper_out); |
288 | + rapidjson::IStreamWrapper dblp_paper_isw(dblp_paper_in); | ||
289 | + reader.Parse(dblp_paper_isw, paper_handler); | ||
174 | 290 | ||
175 | - rapidjson::IStreamWrapper dblp_paper_isw(dblp_paper_in); | 291 | + if (dblp_paper_in) dblp_paper_in.close(); |
176 | - rapidjson::Reader reader; | 292 | + if (dblp_paper_out) dblp_paper_out.close(); |
293 | + } | ||
177 | 294 | ||
178 | - reader.Parse(dblp_paper_isw, paper_handler); | 295 | + if (mode & MODE::COAUTHOR) { |
296 | + ifstream dblp_coauthor_in; | ||
297 | + ofstream dblp_coauthor_out; | ||
298 | + dblp_coauthor_in.open(DBLP_COAUTHOR_FILENAME); | ||
299 | + dblp_coauthor_out.open((string(DBLP_COAUTHOR_FILENAME)+string(".out")).c_str()); | ||
300 | + if (!dblp_coauthor_in || !dblp_coauthor_out) { | ||
301 | + throw exception("dblp coauthor file"); | ||
302 | + } | ||
179 | 303 | ||
180 | - //release | 304 | + DblpCoauthorHandler coauthor_handler(dblp_coauthor_out); |
181 | - if (dblp_paper_in) dblp_paper_in.close(); | 305 | + rapidjson::IStreamWrapper dblp_coauthor_isw(dblp_coauthor_in); |
182 | - if (dblp_paper_out) dblp_paper_out.close(); | 306 | + reader.Parse(dblp_coauthor_isw, coauthor_handler); |
183 | - if (dblp_coauthor_in) dblp_coauthor_in.close(); | 307 | + |
184 | - if (dblp_coauthor_out) dblp_coauthor_out.close(); | 308 | + if (dblp_coauthor_in) dblp_coauthor_in.close(); |
309 | + if (dblp_coauthor_out) dblp_coauthor_out.close(); | ||
310 | + } | ||
185 | } | 311 | } |
186 | catch (const exception& e) { | 312 | catch (const exception& e) { |
187 | cerr << "Error: " << e.what() << endl; | 313 | cerr << "Error: " << e.what() << endl; | ... | ... |
-
Please register or login to post a comment