조성현

Changed JSON parsing to the RapidJSON SAX version

...@@ -71,19 +71,23 @@ ...@@ -71,19 +71,23 @@
71 <PropertyGroup Label="UserMacros" /> 71 <PropertyGroup Label="UserMacros" />
72 <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> 72 <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
73 <LinkIncremental>true</LinkIncremental> 73 <LinkIncremental>true</LinkIncremental>
74 - <IncludePath>C:\boost\boost_1_62_0;C:\JsonCpp\jsoncpp-master\include;$(IncludePath)</IncludePath> 74 + <IncludePath>C:\boost\boost_1_62_0;C:\rapidjson\include;$(IncludePath)</IncludePath>
75 - <LibraryPath>C:\boost\boost_1_62_0\stage\lib;C:\JsonCpp\jsoncpp-master\build\vs71\debug\lib_json;$(LibraryPath)</LibraryPath> 75 + <LibraryPath>C:\boost\boost_1_62_0\stage\lib;$(LibraryPath)</LibraryPath>
76 </PropertyGroup> 76 </PropertyGroup>
77 <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> 77 <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
78 <LinkIncremental>true</LinkIncremental> 78 <LinkIncremental>true</LinkIncremental>
79 - <IncludePath>C:\boost\boost_1_62_0;C:\JsonCpp\jsoncpp-master\include;$(IncludePath)</IncludePath> 79 + <IncludePath>C:\boost\boost_1_62_0;C:\rapidjson\include;$(IncludePath)</IncludePath>
80 - <LibraryPath>C:\boost\boost_1_62_0\stage\lib;C:\JsonCpp\jsoncpp-master\build\vs71\debug\lib_json;$(LibraryPath)</LibraryPath> 80 + <LibraryPath>C:\boost\boost_1_62_0\stage\lib;$(LibraryPath)</LibraryPath>
81 </PropertyGroup> 81 </PropertyGroup>
82 <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> 82 <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
83 <LinkIncremental>false</LinkIncremental> 83 <LinkIncremental>false</LinkIncremental>
84 + <IncludePath>C:\rapidjson\include;C:\boost\boost_1_62_0;$(IncludePath)</IncludePath>
85 + <LibraryPath>C:\boost\boost_1_62_0\stage\lib;$(LibraryPath)</LibraryPath>
84 </PropertyGroup> 86 </PropertyGroup>
85 <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> 87 <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
86 <LinkIncremental>false</LinkIncremental> 88 <LinkIncremental>false</LinkIncremental>
89 + <IncludePath>C:\rapidjson\include;C:\boost\boost_1_62_0;$(IncludePath)</IncludePath>
90 + <LibraryPath>C:\boost\boost_1_62_0\stage\lib;$(LibraryPath)</LibraryPath>
87 </PropertyGroup> 91 </PropertyGroup>
88 <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> 92 <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
89 <ClCompile> 93 <ClCompile>
...@@ -99,7 +103,7 @@ ...@@ -99,7 +103,7 @@
99 <SubSystem>Console</SubSystem> 103 <SubSystem>Console</SubSystem>
100 <GenerateDebugInformation>true</GenerateDebugInformation> 104 <GenerateDebugInformation>true</GenerateDebugInformation>
101 <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> 105 <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
102 - <AdditionalDependencies>json_vc71_libmtd.lib;%(AdditionalDependencies)</AdditionalDependencies> 106 + <AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
103 </Link> 107 </Link>
104 </ItemDefinitionGroup> 108 </ItemDefinitionGroup>
105 <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> 109 <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
...@@ -116,7 +120,7 @@ ...@@ -116,7 +120,7 @@
116 <SubSystem>Console</SubSystem> 120 <SubSystem>Console</SubSystem>
117 <GenerateDebugInformation>true</GenerateDebugInformation> 121 <GenerateDebugInformation>true</GenerateDebugInformation>
118 <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> 122 <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
119 - <AdditionalDependencies>json_vc71_libmtd.lib;%(AdditionalDependencies)</AdditionalDependencies> 123 + <AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
120 </Link> 124 </Link>
121 </ItemDefinitionGroup> 125 </ItemDefinitionGroup>
122 <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> 126 <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
......
1 #include <cstdio> 1 #include <cstdio>
2 +#include <cinttypes> //PRId64
2 #include <exception> 3 #include <exception>
3 #include <fstream> 4 #include <fstream>
4 #include <iostream> 5 #include <iostream>
...@@ -7,7 +8,9 @@ ...@@ -7,7 +8,9 @@
7 8
8 #include <boost/regex.hpp> 9 #include <boost/regex.hpp>
9 10
10 -#include <json/json.h> 11 +#include <rapidjson/document.h>
12 +#include <rapidjson/istreamwrapper.h>
13 +#include <rapidjson/reader.h>
11 14
12 using namespace std; 15 using namespace std;
13 16
...@@ -15,105 +18,175 @@ const char* DBLP_FILENAME = "dblp.json"; ...@@ -15,105 +18,175 @@ const char* DBLP_FILENAME = "dblp.json";
15 const char* DBLP_COAUTHOR_FILENAME = "tmp_dblp_coauthorship.json"; 18 const char* DBLP_COAUTHOR_FILENAME = "tmp_dblp_coauthorship.json";
16 const string COLUMN_DELIMITER = "||"; 19 const string COLUMN_DELIMITER = "||";
17 const string AUTHOR_DELIMITER = "&&"; 20 const string AUTHOR_DELIMITER = "&&";
21 +boost::regex paper_reg{"(conf|journals).*"};
18 22
19 -int main() {
20 - //init
21 - Json::Value root;
22 - Json::Reader reader;
23 - ifstream dblp_paper, dblp_coauthor;
24 - ofstream dblp_paper_out, dblp_coauthor_out;
25 - boost::regex paper_reg{"(conf|journals).*"};
26 23
27 - try { 24 +struct PaperRecord {
28 - //1. dblp paper dataset 25 + string paper_key;
29 - dblp_paper.open(DBLP_FILENAME); 26 + vector<string> authors;
30 - dblp_paper_out.open((string(DBLP_FILENAME)+string(".out")).c_str()); 27 + unsigned int year;
31 - if (!dblp_paper || !dblp_paper_out) {
32 - throw exception("dblp paper file error");
33 - }
34 - printf("* complete open\n");
35 28
36 - if (!reader.parse(dblp_paper, root)) { 29 + void write(ofstream& fout) {
37 - throw exception("parse error"); 30 + fout << paper_key << COLUMN_DELIMITER;
31 + for (auto it=authors.begin(); it!=authors.end(); ++it) {
32 + if (it != authors.begin()) {
33 + fout << AUTHOR_DELIMITER;
34 + }
35 + fout << *it;
38 } 36 }
39 - printf("* complete parse\n"); 37 + fout << COLUMN_DELIMITER << year << endl;
38 + }
39 + void clear() {
40 + paper_key.clear();
41 + authors.clear();
42 + year = 1;
43 + }
44 + void add_author(string str) {
45 + authors.push_back(str);
46 + }
47 +};
40 48
49 +struct DblpPaperHandler {
50 + bool whole_array = false;
51 + bool is_record = false;
52 + bool is_authors = false;
53 + bool is_paper = false;
54 + uint64_t record_count = 0;
41 55
42 - Json::Value row; 56 + PaperRecord paper;
43 - Json::Value coauthors; 57 + ofstream& ofs;
44 58
45 - std::string paper_key; 59 + DblpPaperHandler(ofstream& fout)
46 - std::vector<string> coauthor_list; 60 + : ofs(fout) {
47 - int year; 61 + }
48 62
49 - int count = 1; 63 + //
50 - for (auto it=root.begin(); 64 + bool Null() {
51 - it!=root.end(); 65 + return true;
52 - ++it) { 66 + }
67 + bool Bool(bool b) {
68 + //cout << "Bool(" << boolalpha << b << ")" << endl;
69 + return true;
70 + }
71 + bool Int(int i) {
72 + //cout << "Int(" << i << ")" << endl;
73 + return true;
74 + }
75 + bool Uint(unsigned u) {
76 + //cout << "Uint(" << u << ")" << endl;
77 + if (is_paper) {
78 + paper.year = u;
79 + }
53 80
54 - //Àüó¸® 81 + return true;
55 - row.clear(); 82 + }
56 - coauthors.clear(); 83 + bool Int64(int64_t i) {
57 - coauthor_list.clear(); 84 + //cout << "Int64(" << i << ")" << endl;
85 + return true;
86 + }
87 + bool Uint64(uint64_t u) {
88 + //cout << "Uint64(" << u << ")" << endl;
89 + return true;
90 + }
91 + bool Double(double d) {
92 + //cout << "Double(" << d << ")" << endl;
93 + return true;
94 + }
95 + bool RawNumber(const char* str, rapidjson::SizeType length, bool copy) {
96 + //cout << "Number(" << str << ", " << length << ", " << boolalpha << copy << ")" << endl;
97 + return true;
98 + }
99 + bool String(const char* str, rapidjson::SizeType length, bool copy) {
100 + if (is_record) {
101 + if (is_authors) {
102 + if (!is_paper)
103 + return true;
58 104
59 - //print 105 + paper.add_author(string(str));
60 - if (count%1000000 == 0) { 106 + } else {
61 - printf("* [%d]\n", count); 107 + if (boost::regex_match(str, paper_reg)) {
108 + is_paper = true;
109 + paper.paper_key = string(str);
110 + }
62 } 111 }
63 - 112 + }
64 - //row 단위로 read 113 + return true;
65 - row = *it; 114 + }
66 - paper_key = row[0].asString(); 115 + bool StartObject() {
67 - 116 + //cout << "StartObject()" << endl;
68 - //check whether it is paper 117 + return true;
69 - if (boost::regex_match(paper_key, paper_reg)) { 118 + }
70 - coauthors = row[1]; 119 + bool Key(const char* str, rapidjson::SizeType length, bool copy) {
71 - for (auto coit=coauthors.begin(); coit!=coauthors.end(); ++coit) { 120 + //cout << "Key(" << str << ", " << length << ", " << boolalpha << copy << ")" << endl;
72 - coauthor_list.push_back(coit->asString()); 121 + return true;
122 + }
123 + bool EndObject(rapidjson::SizeType memberCount) {
124 + //cout << "EndObject(" << memberCount << ")" << endl;
125 + return true;
126 + }
127 + bool StartArray() {
128 + if (!whole_array) {
129 + whole_array = true;
130 + } else if (!is_record) {
131 + is_record = true;
132 + } else if (!is_authors) {
133 + is_authors = true;
134 + }
135 + return true;
136 + }
137 + bool EndArray(rapidjson::SizeType elementCount) {
138 + if (is_record) {
139 + if (is_authors) {
140 + is_authors = false;
141 + } else {
142 + if (is_paper) {
143 + paper.write(ofs);
144 + paper.clear();
73 } 145 }
74 - year = ((row[2].isNull())?-1:row[2].asInt()); 146 +
75 - 147 + is_record = false;
76 - //write 148 + is_paper = false;
77 - dblp_paper_out << paper_key << COLUMN_DELIMITER; 149 + ++record_count;
78 - if (coauthor_list.size() > 0) { 150 + if (record_count % 100000 == 0) {
79 - for (auto auit=coauthor_list.begin(); auit!=coauthor_list.end();) { 151 + printf("* [%" PRIu64 "] \n", record_count);
80 - dblp_paper_out << (*auit);
81 - ++auit;
82 - if (auit != coauthor_list.end()) {
83 - dblp_paper_out << AUTHOR_DELIMITER;
84 - }
85 - }
86 - } else {
87 - //empty
88 - throw exception("paper without author");
89 } 152 }
90 - dblp_paper_out << COLUMN_DELIMITER
91 - << year
92 - << endl;
93 - } else {
94 - //not paper
95 } 153 }
154 + } else {
155 + whole_array = false;
156 + printf("* total paper record: [%" PRIu64 "]\n", record_count);
157 + }
158 + return true;
159 + }
160 +};
161 +
96 162
97 - //후처리 163 +int main() {
98 - ++count; 164 + try {
165 + ifstream dblp_paper_in, dblp_coauthor_in;
166 + ofstream dblp_paper_out, dblp_coauthor_out;
167 + dblp_paper_in.open(DBLP_FILENAME);
168 + dblp_paper_out.open((string(DBLP_FILENAME)+string(".out")).c_str());
169 + if (!dblp_paper_in || !dblp_paper_out) {
170 + throw exception("dblp paper file");
99 } 171 }
100 - printf("* complete convert dblp paper\n"); 172 +
101 - 173 + DblpPaperHandler paper_handler(dblp_paper_out);
102 174
103 - //2. dblp coauthorship dataset 175 + rapidjson::IStreamWrapper dblp_paper_isw(dblp_paper_in);
104 - //dblp_coauthor.open(DBLP_COAUTHOR_FILENAME); 176 + rapidjson::Reader reader;
105 177
178 + reader.Parse(dblp_paper_isw, paper_handler);
179 +
180 + //release
181 + if (dblp_paper_in) dblp_paper_in.close();
182 + if (dblp_paper_out) dblp_paper_out.close();
183 + if (dblp_coauthor_in) dblp_coauthor_in.close();
184 + if (dblp_coauthor_out) dblp_coauthor_out.close();
106 } 185 }
107 catch (const exception& e) { 186 catch (const exception& e) {
108 cerr << "Error: " << e.what() << endl; 187 cerr << "Error: " << e.what() << endl;
109 return -1; 188 return -1;
110 } 189 }
111 190
112 -
113 - //release
114 - if (dblp_paper) dblp_paper.close();
115 - if (dblp_coauthor) dblp_coauthor.close();
116 - if (dblp_paper_out) dblp_paper_out.close();
117 - if (dblp_coauthor_out) dblp_coauthor_out.close();
118 return 0; 191 return 0;
119 } 192 }
...\ No newline at end of file ...\ No newline at end of file
......