Showing
2 changed files
with
160 additions
and
83 deletions
... | @@ -71,19 +71,23 @@ | ... | @@ -71,19 +71,23 @@ |
71 | <PropertyGroup Label="UserMacros" /> | 71 | <PropertyGroup Label="UserMacros" /> |
72 | <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> | 72 | <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> |
73 | <LinkIncremental>true</LinkIncremental> | 73 | <LinkIncremental>true</LinkIncremental> |
74 | - <IncludePath>C:\boost\boost_1_62_0;C:\JsonCpp\jsoncpp-master\include;$(IncludePath)</IncludePath> | 74 | + <IncludePath>C:\boost\boost_1_62_0;C:\rapidjson\include;$(IncludePath)</IncludePath> |
75 | - <LibraryPath>C:\boost\boost_1_62_0\stage\lib;C:\JsonCpp\jsoncpp-master\build\vs71\debug\lib_json;$(LibraryPath)</LibraryPath> | 75 | + <LibraryPath>C:\boost\boost_1_62_0\stage\lib;$(LibraryPath)</LibraryPath> |
76 | </PropertyGroup> | 76 | </PropertyGroup> |
77 | <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> | 77 | <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
78 | <LinkIncremental>true</LinkIncremental> | 78 | <LinkIncremental>true</LinkIncremental> |
79 | - <IncludePath>C:\boost\boost_1_62_0;C:\JsonCpp\jsoncpp-master\include;$(IncludePath)</IncludePath> | 79 | + <IncludePath>C:\boost\boost_1_62_0;C:\rapidjson\include;$(IncludePath)</IncludePath> |
80 | - <LibraryPath>C:\boost\boost_1_62_0\stage\lib;C:\JsonCpp\jsoncpp-master\build\vs71\debug\lib_json;$(LibraryPath)</LibraryPath> | 80 | + <LibraryPath>C:\boost\boost_1_62_0\stage\lib;$(LibraryPath)</LibraryPath> |
81 | </PropertyGroup> | 81 | </PropertyGroup> |
82 | <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> | 82 | <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> |
83 | <LinkIncremental>false</LinkIncremental> | 83 | <LinkIncremental>false</LinkIncremental> |
84 | + <IncludePath>C:\rapidjson\include;C:\boost\boost_1_62_0;$(IncludePath)</IncludePath> | ||
85 | + <LibraryPath>C:\boost\boost_1_62_0\stage\lib;$(LibraryPath)</LibraryPath> | ||
84 | </PropertyGroup> | 86 | </PropertyGroup> |
85 | <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> | 87 | <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> |
86 | <LinkIncremental>false</LinkIncremental> | 88 | <LinkIncremental>false</LinkIncremental> |
89 | + <IncludePath>C:\rapidjson\include;C:\boost\boost_1_62_0;$(IncludePath)</IncludePath> | ||
90 | + <LibraryPath>C:\boost\boost_1_62_0\stage\lib;$(LibraryPath)</LibraryPath> | ||
87 | </PropertyGroup> | 91 | </PropertyGroup> |
88 | <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> | 92 | <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> |
89 | <ClCompile> | 93 | <ClCompile> |
... | @@ -99,7 +103,7 @@ | ... | @@ -99,7 +103,7 @@ |
99 | <SubSystem>Console</SubSystem> | 103 | <SubSystem>Console</SubSystem> |
100 | <GenerateDebugInformation>true</GenerateDebugInformation> | 104 | <GenerateDebugInformation>true</GenerateDebugInformation> |
101 | <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> | 105 | <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> |
102 | - <AdditionalDependencies>json_vc71_libmtd.lib;%(AdditionalDependencies)</AdditionalDependencies> | 106 | + <AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies> |
103 | </Link> | 107 | </Link> |
104 | </ItemDefinitionGroup> | 108 | </ItemDefinitionGroup> |
105 | <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> | 109 | <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> |
... | @@ -116,7 +120,7 @@ | ... | @@ -116,7 +120,7 @@ |
116 | <SubSystem>Console</SubSystem> | 120 | <SubSystem>Console</SubSystem> |
117 | <GenerateDebugInformation>true</GenerateDebugInformation> | 121 | <GenerateDebugInformation>true</GenerateDebugInformation> |
118 | <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> | 122 | <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> |
119 | - <AdditionalDependencies>json_vc71_libmtd.lib;%(AdditionalDependencies)</AdditionalDependencies> | 123 | + <AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies> |
120 | </Link> | 124 | </Link> |
121 | </ItemDefinitionGroup> | 125 | </ItemDefinitionGroup> |
122 | <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> | 126 | <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> | ... | ... |
1 | #include <cstdio> | 1 | #include <cstdio> |
2 | +#include <cinttypes> //PRIu64 | ||
2 | #include <exception> | 3 | #include <exception> |
3 | #include <fstream> | 4 | #include <fstream> |
4 | #include <iostream> | 5 | #include <iostream> |
... | @@ -7,7 +8,9 @@ | ... | @@ -7,7 +8,9 @@ |
7 | 8 | ||
8 | #include <boost/regex.hpp> | 9 | #include <boost/regex.hpp> |
9 | 10 | ||
10 | -#include <json/json.h> | 11 | +#include <rapidjson/document.h> |
12 | +#include <rapidjson/istreamwrapper.h> | ||
13 | +#include <rapidjson/reader.h> | ||
11 | 14 | ||
12 | using namespace std; | 15 | using namespace std; |
13 | 16 | ||
... | @@ -15,105 +18,175 @@ const char* DBLP_FILENAME = "dblp.json"; | ... | @@ -15,105 +18,175 @@ const char* DBLP_FILENAME = "dblp.json"; |
15 | const char* DBLP_COAUTHOR_FILENAME = "tmp_dblp_coauthorship.json"; | 18 | const char* DBLP_COAUTHOR_FILENAME = "tmp_dblp_coauthorship.json"; |
16 | const string COLUMN_DELIMITER = "||"; | 19 | const string COLUMN_DELIMITER = "||"; |
17 | const string AUTHOR_DELIMITER = "&&"; | 20 | const string AUTHOR_DELIMITER = "&&"; |
21 | +boost::regex paper_reg{"(conf|journals).*"}; | ||
18 | 22 | ||
19 | -int main() { | ||
20 | - //init | ||
21 | - Json::Value root; | ||
22 | - Json::Reader reader; | ||
23 | - ifstream dblp_paper, dblp_coauthor; | ||
24 | - ofstream dblp_paper_out, dblp_coauthor_out; | ||
25 | - boost::regex paper_reg{"(conf|journals).*"}; | ||
26 | 23 | ||
27 | - try { | 24 | +struct PaperRecord { |
28 | - //1. dblp paper dataset | 25 | + string paper_key; |
29 | - dblp_paper.open(DBLP_FILENAME); | 26 | + vector<string> authors; |
30 | - dblp_paper_out.open((string(DBLP_FILENAME)+string(".out")).c_str()); | 27 | + unsigned int year; |
31 | - if (!dblp_paper || !dblp_paper_out) { | ||
32 | - throw exception("dblp paper file error"); | ||
33 | - } | ||
34 | - printf("* complete open\n"); | ||
35 | 28 | ||
36 | - if (!reader.parse(dblp_paper, root)) { | 29 | + void write(ofstream& fout) { |
37 | - throw exception("parse error"); | 30 | + fout << paper_key << COLUMN_DELIMITER; |
31 | + for (auto it=authors.begin(); it!=authors.end(); ++it) { | ||
32 | + if (it != authors.begin()) { | ||
33 | + fout << AUTHOR_DELIMITER; | ||
34 | + } | ||
35 | + fout << *it; | ||
38 | } | 36 | } |
39 | - printf("* complete parse\n"); | 37 | + fout << COLUMN_DELIMITER << year << endl; |
38 | + } | ||
39 | + void clear() { | ||
40 | + paper_key.clear(); | ||
41 | + authors.clear(); | ||
42 | + year = 1; | ||
43 | + } | ||
44 | + void add_author(string str) { | ||
45 | + authors.push_back(str); | ||
46 | + } | ||
47 | +}; | ||
40 | 48 | ||
49 | +struct DblpPaperHandler { | ||
50 | + bool whole_array = false; | ||
51 | + bool is_record = false; | ||
52 | + bool is_authors = false; | ||
53 | + bool is_paper = false; | ||
54 | + uint64_t record_count = 0; | ||
41 | 55 | ||
42 | - Json::Value row; | 56 | + PaperRecord paper; |
43 | - Json::Value coauthors; | 57 | + ofstream& ofs; |
44 | 58 | ||
45 | - std::string paper_key; | 59 | + DblpPaperHandler(ofstream& fout) |
46 | - std::vector<string> coauthor_list; | 60 | + : ofs(fout) { |
47 | - int year; | 61 | + } |
48 | 62 | ||
49 | - int count = 1; | 63 | + // |
50 | - for (auto it=root.begin(); | 64 | + bool Null() { |
51 | - it!=root.end(); | 65 | + return true; |
52 | - ++it) { | 66 | + } |
67 | + bool Bool(bool b) { | ||
68 | + //cout << "Bool(" << boolalpha << b << ")" << endl; | ||
69 | + return true; | ||
70 | + } | ||
71 | + bool Int(int i) { | ||
72 | + //cout << "Int(" << i << ")" << endl; | ||
73 | + return true; | ||
74 | + } | ||
75 | + bool Uint(unsigned u) { | ||
76 | + //cout << "Uint(" << u << ")" << endl; | ||
77 | + if (is_paper) { | ||
78 | + paper.year = u; | ||
79 | + } | ||
53 | 80 | ||
54 | - //Àüó¸® | 81 | + return true; |
55 | - row.clear(); | 82 | + } |
56 | - coauthors.clear(); | 83 | + bool Int64(int64_t i) { |
57 | - coauthor_list.clear(); | 84 | + //cout << "Int64(" << i << ")" << endl; |
85 | + return true; | ||
86 | + } | ||
87 | + bool Uint64(uint64_t u) { | ||
88 | + //cout << "Uint64(" << u << ")" << endl; | ||
89 | + return true; | ||
90 | + } | ||
91 | + bool Double(double d) { | ||
92 | + //cout << "Double(" << d << ")" << endl; | ||
93 | + return true; | ||
94 | + } | ||
95 | + bool RawNumber(const char* str, rapidjson::SizeType length, bool copy) { | ||
96 | + //cout << "Number(" << str << ", " << length << ", " << boolalpha << copy << ")" << endl; | ||
97 | + return true; | ||
98 | + } | ||
99 | + bool String(const char* str, rapidjson::SizeType length, bool copy) { | ||
100 | + if (is_record) { | ||
101 | + if (is_authors) { | ||
102 | + if (!is_paper) | ||
103 | + return true; | ||
58 | 104 | ||
59 | 105 | + paper.add_author(string(str)); | |
60 | - if (count%1000000 == 0) { | 106 | + } else { |
61 | - printf("* [%d]\n", count); | 107 | + if (boost::regex_match(str, paper_reg)) { |
108 | + is_paper = true; | ||
109 | + paper.paper_key = string(str); | ||
110 | + } | ||
62 | } | 111 | } |
63 | - | 112 | + } |
64 | - //row ´ÜÀ§·Î read | 113 | + return true; |
65 | - row = *it; | 114 | + } |
66 | - paper_key = row[0].asString(); | 115 | + bool StartObject() { |
67 | - | 116 | + //cout << "StartObject()" << endl; |
68 | - //check whether it is paper | 117 | + return true; |
69 | - if (boost::regex_match(paper_key, paper_reg)) { | 118 | + } |
70 | - coauthors = row[1]; | 119 | + bool Key(const char* str, rapidjson::SizeType length, bool copy) { |
71 | - for (auto coit=coauthors.begin(); coit!=coauthors.end(); ++coit) { | 120 | + //cout << "Key(" << str << ", " << length << ", " << boolalpha << copy << ")" << endl; |
72 | - coauthor_list.push_back(coit->asString()); | 121 | + return true; |
122 | + } | ||
123 | + bool EndObject(rapidjson::SizeType memberCount) { | ||
124 | + //cout << "EndObject(" << memberCount << ")" << endl; | ||
125 | + return true; | ||
126 | + } | ||
127 | + bool StartArray() { | ||
128 | + if (!whole_array) { | ||
129 | + whole_array = true; | ||
130 | + } else if (!is_record) { | ||
131 | + is_record = true; | ||
132 | + } else if (!is_authors) { | ||
133 | + is_authors = true; | ||
134 | + } | ||
135 | + return true; | ||
136 | + } | ||
137 | + bool EndArray(rapidjson::SizeType elementCount) { | ||
138 | + if (is_record) { | ||
139 | + if (is_authors) { | ||
140 | + is_authors = false; | ||
141 | + } else { | ||
142 | + if (is_paper) { | ||
143 | + paper.write(ofs); | ||
144 | + paper.clear(); | ||
73 | } | 145 | } |
74 | - year = ((row[2].isNull())?-1:row[2].asInt()); | 146 | + |
75 | - | 147 | + is_record = false; |
76 | - //write | 148 | + is_paper = false; |
77 | - dblp_paper_out << paper_key << COLUMN_DELIMITER; | 149 | + ++record_count; |
78 | - if (coauthor_list.size() > 0) { | 150 | + if (record_count % 100000 == 0) { |
79 | - for (auto auit=coauthor_list.begin(); auit!=coauthor_list.end();) { | 151 | + printf("* [%" PRIu64 "] \n", record_count); |
80 | - dblp_paper_out << (*auit); | ||
81 | - ++auit; | ||
82 | - if (auit != coauthor_list.end()) { | ||
83 | - dblp_paper_out << AUTHOR_DELIMITER; | ||
84 | - } | ||
85 | - } | ||
86 | - } else { | ||
87 | - //empty | ||
88 | - throw exception("paper without author"); | ||
89 | } | 152 | } |
90 | - dblp_paper_out << COLUMN_DELIMITER | ||
91 | - << year | ||
92 | - << endl; | ||
93 | - } else { | ||
94 | - //not paper | ||
95 | } | 153 | } |
154 | + } else { | ||
155 | + whole_array = false; | ||
156 | + printf("* total paper record: [%" PRIu64 "]\n", record_count); | ||
157 | + } | ||
158 | + return true; | ||
159 | + } | ||
160 | +}; | ||
161 | + | ||
96 | 162 | ||
97 | - //ÈÄó¸® | 163 | +int main() { |
98 | - ++count; | 164 | + try { |
165 | + ifstream dblp_paper_in, dblp_coauthor_in; | ||
166 | + ofstream dblp_paper_out, dblp_coauthor_out; | ||
167 | + dblp_paper_in.open(DBLP_FILENAME); | ||
168 | + dblp_paper_out.open((string(DBLP_FILENAME)+string(".out")).c_str()); | ||
169 | + if (!dblp_paper_in || !dblp_paper_out) { | ||
170 | + throw exception("dblp paper file"); | ||
99 | } | 171 | } |
100 | - printf("* complete convert dblp paper\n"); | 172 | + |
101 | - | 173 | + DblpPaperHandler paper_handler(dblp_paper_out); |
102 | 174 | ||
103 | - //2. dblp coauthorship dataset | 175 | + rapidjson::IStreamWrapper dblp_paper_isw(dblp_paper_in); |
104 | - //dblp_coauthor.open(DBLP_COAUTHOR_FILENAME); | 176 | + rapidjson::Reader reader; |
105 | 177 | ||
178 | + reader.Parse(dblp_paper_isw, paper_handler); | ||
179 | + | ||
180 | + //release | ||
181 | + if (dblp_paper_in) dblp_paper_in.close(); | ||
182 | + if (dblp_paper_out) dblp_paper_out.close(); | ||
183 | + if (dblp_coauthor_in) dblp_coauthor_in.close(); | ||
184 | + if (dblp_coauthor_out) dblp_coauthor_out.close(); | ||
106 | } | 185 | } |
107 | catch (const exception& e) { | 186 | catch (const exception& e) { |
108 | cerr << "Error: " << e.what() << endl; | 187 | cerr << "Error: " << e.what() << endl; |
109 | return -1; | 188 | return -1; |
110 | } | 189 | } |
111 | 190 | ||
112 | - | ||
113 | - //release | ||
114 | - if (dblp_paper) dblp_paper.close(); | ||
115 | - if (dblp_coauthor) dblp_coauthor.close(); | ||
116 | - if (dblp_paper_out) dblp_paper_out.close(); | ||
117 | - if (dblp_coauthor_out) dblp_coauthor_out.close(); | ||
118 | return 0; | 191 | return 0; |
119 | } | 192 | } |
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
-
Please register or login to post a comment