조성현

added coauthor parser

...@@ -14,6 +14,13 @@ ...@@ -14,6 +14,13 @@
14 14
15 using namespace std; 15 using namespace std;
16 16
/// Bit flags selecting which parsing stages main() runs.
/// Each stage owns one bit so flags can be OR-ed together and tested with `&`.
enum MODE {
    NONE     = 0,
    PAPER    = 1 << 0,  // parse DBLP_FILENAME with the paper handler
    COAUTHOR = 1 << 1,  // parse DBLP_COAUTHOR_FILENAME with the coauthor handler
};
/// Stages enabled for this build; main() checks it as `mode & MODE::<stage>`.
constexpr int mode = MODE::PAPER | MODE::COAUTHOR;
23 +
17 const char* DBLP_FILENAME = "dblp.json"; 24 const char* DBLP_FILENAME = "dblp.json";
18 const char* DBLP_COAUTHOR_FILENAME = "tmp_dblp_coauthorship.json"; 25 const char* DBLP_COAUTHOR_FILENAME = "tmp_dblp_coauthorship.json";
19 const string COLUMN_DELIMITER = "||"; 26 const string COLUMN_DELIMITER = "||";
...@@ -159,11 +166,118 @@ struct DblpPaperHandler { ...@@ -159,11 +166,118 @@ struct DblpPaperHandler {
159 } 166 }
160 }; 167 };
161 168
169 +struct CoauthorRecord {
170 + string author1, author2;
171 + unsigned int year;
172 +
173 + void write(ofstream& fout) {
174 + fout << author1 << COLUMN_DELIMITER
175 + << author2 << COLUMN_DELIMITER
176 + << year << endl;
177 + }
178 +};
179 +
180 +struct DblpCoauthorHandler {
181 + bool whole_array = false;
182 + bool is_record = false;
183 + int read_author_cnt = 0;
184 + uint64_t record_count = 0;
185 +
186 + CoauthorRecord coauthor;
187 + ofstream& ofs;
188 +
189 + DblpCoauthorHandler(ofstream& fout)
190 + : ofs(fout) {
191 + }
162 192
163 -int main() { 193 + //
194 + bool Null() {
195 + return true;
196 + }
197 + bool Bool(bool b) {
198 + //cout << "Bool(" << boolalpha << b << ")" << endl;
199 + return true;
200 + }
201 + bool Int(int i) {
202 + //cout << "Int(" << i << ")" << endl;
203 + return true;
204 + }
205 + bool Uint(unsigned u) {
206 + coauthor.year = u;
207 + return true;
208 + }
209 + bool Int64(int64_t i) {
210 + //cout << "Int64(" << i << ")" << endl;
211 + return true;
212 + }
213 + bool Uint64(uint64_t u) {
214 + //cout << "Uint64(" << u << ")" << endl;
215 + return true;
216 + }
217 + bool Double(double d) {
218 + //cout << "Double(" << d << ")" << endl;
219 + return true;
220 + }
221 + bool RawNumber(const char* str, rapidjson::SizeType length, bool copy) {
222 + //cout << "Number(" << str << ", " << length << ", " << boolalpha << copy << ")" << endl;
223 + return true;
224 + }
225 + bool String(const char* str, rapidjson::SizeType length, bool copy) {
226 + if (is_record) {
227 + if (read_author_cnt == 0) {
228 + coauthor.author1 = string(str);
229 + } else {
230 + coauthor.author2 = string(str);
231 + }
232 +
233 + ++read_author_cnt;
234 + }
235 + return true;
236 + }
237 + bool StartObject() {
238 + //cout << "StartObject()" << endl;
239 + return true;
240 + }
241 + bool Key(const char* str, rapidjson::SizeType length, bool copy) {
242 + //cout << "Key(" << str << ", " << length << ", " << boolalpha << copy << ")" << endl;
243 + return true;
244 + }
245 + bool EndObject(rapidjson::SizeType memberCount) {
246 + //cout << "EndObject(" << memberCount << ")" << endl;
247 + return true;
248 + }
249 + bool StartArray() {
250 + if (!whole_array) {
251 + whole_array = true;
252 + } else if (!is_record) {
253 + is_record = true;
254 + read_author_cnt = 0;
255 + }
256 + return true;
257 + }
258 + bool EndArray(rapidjson::SizeType elementCount) {
259 + if (is_record) {
260 + coauthor.write(ofs);
261 + is_record = false;
262 + ++record_count;
263 + if (record_count % 100000 == 0) {
264 + printf("* [%" PRIu64 "] \n", record_count);
265 + }
266 + } else {
267 + whole_array = false;
268 + printf("* total paper record: [%" PRIu64 "]\n", record_count);
269 + }
270 + return true;
271 + }
272 +};
273 +
274 +
275 +int main(int argc, char* argv[]) {
276 + rapidjson::Reader reader;
164 try { 277 try {
165 - ifstream dblp_paper_in, dblp_coauthor_in; 278 + if (mode & MODE::PAPER) {
166 - ofstream dblp_paper_out, dblp_coauthor_out; 279 + ifstream dblp_paper_in;
280 + ofstream dblp_paper_out;
167 dblp_paper_in.open(DBLP_FILENAME); 281 dblp_paper_in.open(DBLP_FILENAME);
168 dblp_paper_out.open((string(DBLP_FILENAME)+string(".out")).c_str()); 282 dblp_paper_out.open((string(DBLP_FILENAME)+string(".out")).c_str());
169 if (!dblp_paper_in || !dblp_paper_out) { 283 if (!dblp_paper_in || !dblp_paper_out) {
...@@ -171,18 +285,30 @@ int main() { ...@@ -171,18 +285,30 @@ int main() {
171 } 285 }
172 286
173 DblpPaperHandler paper_handler(dblp_paper_out); 287 DblpPaperHandler paper_handler(dblp_paper_out);
174 -
175 rapidjson::IStreamWrapper dblp_paper_isw(dblp_paper_in); 288 rapidjson::IStreamWrapper dblp_paper_isw(dblp_paper_in);
176 - rapidjson::Reader reader;
177 -
178 reader.Parse(dblp_paper_isw, paper_handler); 289 reader.Parse(dblp_paper_isw, paper_handler);
179 290
180 - //release
181 if (dblp_paper_in) dblp_paper_in.close(); 291 if (dblp_paper_in) dblp_paper_in.close();
182 if (dblp_paper_out) dblp_paper_out.close(); 292 if (dblp_paper_out) dblp_paper_out.close();
293 + }
294 +
295 + if (mode & MODE::COAUTHOR) {
296 + ifstream dblp_coauthor_in;
297 + ofstream dblp_coauthor_out;
298 + dblp_coauthor_in.open(DBLP_COAUTHOR_FILENAME);
299 + dblp_coauthor_out.open((string(DBLP_COAUTHOR_FILENAME)+string(".out")).c_str());
300 + if (!dblp_coauthor_in || !dblp_coauthor_out) {
301 + throw exception("dblp coauthor file");
302 + }
303 +
304 + DblpCoauthorHandler coauthor_handler(dblp_coauthor_out);
305 + rapidjson::IStreamWrapper dblp_coauthor_isw(dblp_coauthor_in);
306 + reader.Parse(dblp_coauthor_isw, coauthor_handler);
307 +
183 if (dblp_coauthor_in) dblp_coauthor_in.close(); 308 if (dblp_coauthor_in) dblp_coauthor_in.close();
184 if (dblp_coauthor_out) dblp_coauthor_out.close(); 309 if (dblp_coauthor_out) dblp_coauthor_out.close();
185 } 310 }
311 + }
186 catch (const exception& e) { 312 catch (const exception& e) {
187 cerr << "Error: " << e.what() << endl; 313 cerr << "Error: " << e.what() << endl;
188 return -1; 314 return -1;
......