조성현

added coauthor parser

...@@ -14,6 +14,13 @@ ...@@ -14,6 +14,13 @@
14 14
15 using namespace std; 15 using namespace std;
16 16
/// Bit flags selecting which conversion passes main() runs.
/// Deliberately an unscoped enum: the enumerators are combined with `|` and
/// tested with `&` as plain integers.
enum MODE {
    NONE = 0,      ///< no pass selected
    PAPER = 1,     ///< run the pass guarded by `mode & MODE::PAPER`
    COAUTHOR = 2,  ///< run the pass guarded by `mode & MODE::COAUTHOR`
};

/// Passes enabled in this build: both PAPER and COAUTHOR.
/// constexpr so the `mode & MODE::X` guards are compile-time constants.
constexpr int mode = MODE::PAPER | MODE::COAUTHOR;
23 +
// Input file opened by the PAPER pass; its output is this name + ".out".
const char* DBLP_FILENAME = "dblp.json";
// Input file opened by the COAUTHOR pass; its output is this name + ".out".
const char* DBLP_COAUTHOR_FILENAME = "tmp_dblp_coauthorship.json";
// Separator written between the columns of every output row.
const std::string COLUMN_DELIMITER = "||";
...@@ -159,29 +166,148 @@ struct DblpPaperHandler { ...@@ -159,29 +166,148 @@ struct DblpPaperHandler {
159 } 166 }
160 }; 167 };
161 168
169 +struct CoauthorRecord {
170 + string author1, author2;
171 + unsigned int year;
162 172
163 -int main() { 173 + void write(ofstream& fout) {
164 - try { 174 + fout << author1 << COLUMN_DELIMITER
165 - ifstream dblp_paper_in, dblp_coauthor_in; 175 + << author2 << COLUMN_DELIMITER
166 - ofstream dblp_paper_out, dblp_coauthor_out; 176 + << year << endl;
167 - dblp_paper_in.open(DBLP_FILENAME); 177 + }
168 - dblp_paper_out.open((string(DBLP_FILENAME)+string(".out")).c_str()); 178 +};
169 - if (!dblp_paper_in || !dblp_paper_out) { 179 +
170 - throw exception("dblp paper file"); 180 +struct DblpCoauthorHandler {
181 + bool whole_array = false;
182 + bool is_record = false;
183 + int read_author_cnt = 0;
184 + uint64_t record_count = 0;
185 +
186 + CoauthorRecord coauthor;
187 + ofstream& ofs;
188 +
189 + DblpCoauthorHandler(ofstream& fout)
190 + : ofs(fout) {
191 + }
192 +
193 + //
194 + bool Null() {
195 + return true;
196 + }
197 + bool Bool(bool b) {
198 + //cout << "Bool(" << boolalpha << b << ")" << endl;
199 + return true;
200 + }
201 + bool Int(int i) {
202 + //cout << "Int(" << i << ")" << endl;
203 + return true;
204 + }
205 + bool Uint(unsigned u) {
206 + coauthor.year = u;
207 + return true;
208 + }
209 + bool Int64(int64_t i) {
210 + //cout << "Int64(" << i << ")" << endl;
211 + return true;
212 + }
213 + bool Uint64(uint64_t u) {
214 + //cout << "Uint64(" << u << ")" << endl;
215 + return true;
216 + }
217 + bool Double(double d) {
218 + //cout << "Double(" << d << ")" << endl;
219 + return true;
220 + }
221 + bool RawNumber(const char* str, rapidjson::SizeType length, bool copy) {
222 + //cout << "Number(" << str << ", " << length << ", " << boolalpha << copy << ")" << endl;
223 + return true;
224 + }
225 + bool String(const char* str, rapidjson::SizeType length, bool copy) {
226 + if (is_record) {
227 + if (read_author_cnt == 0) {
228 + coauthor.author1 = string(str);
229 + } else {
230 + coauthor.author2 = string(str);
231 + }
232 +
233 + ++read_author_cnt;
234 + }
235 + return true;
236 + }
237 + bool StartObject() {
238 + //cout << "StartObject()" << endl;
239 + return true;
240 + }
241 + bool Key(const char* str, rapidjson::SizeType length, bool copy) {
242 + //cout << "Key(" << str << ", " << length << ", " << boolalpha << copy << ")" << endl;
243 + return true;
244 + }
245 + bool EndObject(rapidjson::SizeType memberCount) {
246 + //cout << "EndObject(" << memberCount << ")" << endl;
247 + return true;
248 + }
249 + bool StartArray() {
250 + if (!whole_array) {
251 + whole_array = true;
252 + } else if (!is_record) {
253 + is_record = true;
254 + read_author_cnt = 0;
255 + }
256 + return true;
257 + }
258 + bool EndArray(rapidjson::SizeType elementCount) {
259 + if (is_record) {
260 + coauthor.write(ofs);
261 + is_record = false;
262 + ++record_count;
263 + if (record_count % 100000 == 0) {
264 + printf("* [%" PRIu64 "] \n", record_count);
265 + }
266 + } else {
267 + whole_array = false;
268 + printf("* total paper record: [%" PRIu64 "]\n", record_count);
171 } 269 }
270 + return true;
271 + }
272 +};
273 +
274 +
275 +int main(int argc, char* argv[]) {
276 + rapidjson::Reader reader;
277 + try {
278 + if (mode & MODE::PAPER) {
279 + ifstream dblp_paper_in;
280 + ofstream dblp_paper_out;
281 + dblp_paper_in.open(DBLP_FILENAME);
282 + dblp_paper_out.open((string(DBLP_FILENAME)+string(".out")).c_str());
283 + if (!dblp_paper_in || !dblp_paper_out) {
284 + throw exception("dblp paper file");
285 + }
172 286
173 - DblpPaperHandler paper_handler(dblp_paper_out); 287 + DblpPaperHandler paper_handler(dblp_paper_out);
288 + rapidjson::IStreamWrapper dblp_paper_isw(dblp_paper_in);
289 + reader.Parse(dblp_paper_isw, paper_handler);
174 290
175 - rapidjson::IStreamWrapper dblp_paper_isw(dblp_paper_in); 291 + if (dblp_paper_in) dblp_paper_in.close();
176 - rapidjson::Reader reader; 292 + if (dblp_paper_out) dblp_paper_out.close();
293 + }
177 294
178 - reader.Parse(dblp_paper_isw, paper_handler); 295 + if (mode & MODE::COAUTHOR) {
296 + ifstream dblp_coauthor_in;
297 + ofstream dblp_coauthor_out;
298 + dblp_coauthor_in.open(DBLP_COAUTHOR_FILENAME);
299 + dblp_coauthor_out.open((string(DBLP_COAUTHOR_FILENAME)+string(".out")).c_str());
300 + if (!dblp_coauthor_in || !dblp_coauthor_out) {
301 + throw exception("dblp coauthor file");
302 + }
179 303
180 - //release 304 + DblpCoauthorHandler coauthor_handler(dblp_coauthor_out);
181 - if (dblp_paper_in) dblp_paper_in.close(); 305 + rapidjson::IStreamWrapper dblp_coauthor_isw(dblp_coauthor_in);
182 - if (dblp_paper_out) dblp_paper_out.close(); 306 + reader.Parse(dblp_coauthor_isw, coauthor_handler);
183 - if (dblp_coauthor_in) dblp_coauthor_in.close(); 307 +
184 - if (dblp_coauthor_out) dblp_coauthor_out.close(); 308 + if (dblp_coauthor_in) dblp_coauthor_in.close();
309 + if (dblp_coauthor_out) dblp_coauthor_out.close();
310 + }
185 } 311 }
186 catch (const exception& e) { 312 catch (const exception& e) {
187 cerr << "Error: " << e.what() << endl; 313 cerr << "Error: " << e.what() << endl;
......