조성현

Changed JSON parsing from JsonCpp to the RapidJSON SAX reader.

......@@ -71,19 +71,23 @@
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
<IncludePath>C:\boost\boost_1_62_0;C:\JsonCpp\jsoncpp-master\include;$(IncludePath)</IncludePath>
<LibraryPath>C:\boost\boost_1_62_0\stage\lib;C:\JsonCpp\jsoncpp-master\build\vs71\debug\lib_json;$(LibraryPath)</LibraryPath>
<IncludePath>C:\boost\boost_1_62_0;C:\rapidjson\include;$(IncludePath)</IncludePath>
<LibraryPath>C:\boost\boost_1_62_0\stage\lib;$(LibraryPath)</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<IncludePath>C:\boost\boost_1_62_0;C:\JsonCpp\jsoncpp-master\include;$(IncludePath)</IncludePath>
<LibraryPath>C:\boost\boost_1_62_0\stage\lib;C:\JsonCpp\jsoncpp-master\build\vs71\debug\lib_json;$(LibraryPath)</LibraryPath>
<IncludePath>C:\boost\boost_1_62_0;C:\rapidjson\include;$(IncludePath)</IncludePath>
<LibraryPath>C:\boost\boost_1_62_0\stage\lib;$(LibraryPath)</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
<IncludePath>C:\rapidjson\include;C:\boost\boost_1_62_0;$(IncludePath)</IncludePath>
<LibraryPath>C:\boost\boost_1_62_0\stage\lib;$(LibraryPath)</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<IncludePath>C:\rapidjson\include;C:\boost\boost_1_62_0;$(IncludePath)</IncludePath>
<LibraryPath>C:\boost\boost_1_62_0\stage\lib;$(LibraryPath)</LibraryPath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
......@@ -99,7 +103,7 @@
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>json_vc71_libmtd.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
......@@ -116,7 +120,7 @@
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>json_vc71_libmtd.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
......
#include <cstdio>
#include <cinttypes> //PRId64
#include <exception>
#include <fstream>
#include <iostream>
......@@ -7,7 +8,9 @@
#include <boost/regex.hpp>
#include <json/json.h>
#include <rapidjson/document.h>
#include <rapidjson/istreamwrapper.h>
#include <rapidjson/reader.h>
using namespace std;
......@@ -15,105 +18,175 @@ const char* DBLP_FILENAME = "dblp.json";
// Coauthorship dataset file name; in this file it is only referenced from a
// commented-out open() call.
const char* DBLP_COAUTHOR_FILENAME = "tmp_dblp_coauthorship.json";
// Separator written between the key, author-list and year columns of an output row.
const string COLUMN_DELIMITER = "||";
// Separator written between individual author names inside the author column.
const string AUTHOR_DELIMITER = "&&";
// A record key must match this pattern in full (starts with "conf" or "journals")
// for the record to be treated as a paper.
boost::regex paper_reg{"(conf|journals).*"};
int main() {
//init
Json::Value root;
Json::Reader reader;
ifstream dblp_paper, dblp_coauthor;
ofstream dblp_paper_out, dblp_coauthor_out;
boost::regex paper_reg{"(conf|journals).*"};
try {
//1. dblp paper dataset
dblp_paper.open(DBLP_FILENAME);
dblp_paper_out.open((string(DBLP_FILENAME)+string(".out")).c_str());
if (!dblp_paper || !dblp_paper_out) {
throw exception("dblp paper file error");
}
printf("* complete open\n");
struct PaperRecord {
string paper_key;
vector<string> authors;
unsigned int year;
if (!reader.parse(dblp_paper, root)) {
throw exception("parse error");
void write(ofstream& fout) {
fout << paper_key << COLUMN_DELIMITER;
for (auto it=authors.begin(); it!=authors.end(); ++it) {
if (it != authors.begin()) {
fout << AUTHOR_DELIMITER;
}
fout << *it;
}
printf("* complete parse\n");
fout << COLUMN_DELIMITER << year << endl;
}
void clear() {
paper_key.clear();
authors.clear();
year = 1;
}
void add_author(string str) {
authors.push_back(str);
}
};
struct DblpPaperHandler {
bool whole_array = false;
bool is_record = false;
bool is_authors = false;
bool is_paper = false;
uint64_t record_count = 0;
Json::Value row;
Json::Value coauthors;
PaperRecord paper;
ofstream& ofs;
std::string paper_key;
std::vector<string> coauthor_list;
int year;
DblpPaperHandler(ofstream& fout)
: ofs(fout) {
}
int count = 1;
for (auto it=root.begin();
it!=root.end();
++it) {
//
bool Null() {
return true;
}
bool Bool(bool b) {
//cout << "Bool(" << boolalpha << b << ")" << endl;
return true;
}
bool Int(int i) {
//cout << "Int(" << i << ")" << endl;
return true;
}
bool Uint(unsigned u) {
//cout << "Uint(" << u << ")" << endl;
if (is_paper) {
paper.year = u;
}
//Àüó¸®
row.clear();
coauthors.clear();
coauthor_list.clear();
return true;
}
bool Int64(int64_t i) {
//cout << "Int64(" << i << ")" << endl;
return true;
}
bool Uint64(uint64_t u) {
//cout << "Uint64(" << u << ")" << endl;
return true;
}
bool Double(double d) {
//cout << "Double(" << d << ")" << endl;
return true;
}
bool RawNumber(const char* str, rapidjson::SizeType length, bool copy) {
//cout << "Number(" << str << ", " << length << ", " << boolalpha << copy << ")" << endl;
return true;
}
bool String(const char* str, rapidjson::SizeType length, bool copy) {
if (is_record) {
if (is_authors) {
if (!is_paper)
return true;
//print
if (count%1000000 == 0) {
printf("* [%d]\n", count);
paper.add_author(string(str));
} else {
if (boost::regex_match(str, paper_reg)) {
is_paper = true;
paper.paper_key = string(str);
}
}
//row ´ÜÀ§·Î read
row = *it;
paper_key = row[0].asString();
//check whether it is paper
if (boost::regex_match(paper_key, paper_reg)) {
coauthors = row[1];
for (auto coit=coauthors.begin(); coit!=coauthors.end(); ++coit) {
coauthor_list.push_back(coit->asString());
}
return true;
}
bool StartObject() {
//cout << "StartObject()" << endl;
return true;
}
bool Key(const char* str, rapidjson::SizeType length, bool copy) {
//cout << "Key(" << str << ", " << length << ", " << boolalpha << copy << ")" << endl;
return true;
}
bool EndObject(rapidjson::SizeType memberCount) {
//cout << "EndObject(" << memberCount << ")" << endl;
return true;
}
bool StartArray() {
if (!whole_array) {
whole_array = true;
} else if (!is_record) {
is_record = true;
} else if (!is_authors) {
is_authors = true;
}
return true;
}
bool EndArray(rapidjson::SizeType elementCount) {
if (is_record) {
if (is_authors) {
is_authors = false;
} else {
if (is_paper) {
paper.write(ofs);
paper.clear();
}
year = ((row[2].isNull())?-1:row[2].asInt());
//write
dblp_paper_out << paper_key << COLUMN_DELIMITER;
if (coauthor_list.size() > 0) {
for (auto auit=coauthor_list.begin(); auit!=coauthor_list.end();) {
dblp_paper_out << (*auit);
++auit;
if (auit != coauthor_list.end()) {
dblp_paper_out << AUTHOR_DELIMITER;
}
}
} else {
//empty
throw exception("paper without author");
is_record = false;
is_paper = false;
++record_count;
if (record_count % 100000 == 0) {
printf("* [%" PRIu64 "] \n", record_count);
}
dblp_paper_out << COLUMN_DELIMITER
<< year
<< endl;
} else {
//not paper
}
} else {
whole_array = false;
printf("* total paper record: [%" PRIu64 "]\n", record_count);
}
return true;
}
};
//ÈÄó¸®
++count;
int main() {
try {
ifstream dblp_paper_in, dblp_coauthor_in;
ofstream dblp_paper_out, dblp_coauthor_out;
dblp_paper_in.open(DBLP_FILENAME);
dblp_paper_out.open((string(DBLP_FILENAME)+string(".out")).c_str());
if (!dblp_paper_in || !dblp_paper_out) {
throw exception("dblp paper file");
}
printf("* complete convert dblp paper\n");
DblpPaperHandler paper_handler(dblp_paper_out);
//2. dblp coauthorship dataset
//dblp_coauthor.open(DBLP_COAUTHOR_FILENAME);
rapidjson::IStreamWrapper dblp_paper_isw(dblp_paper_in);
rapidjson::Reader reader;
reader.Parse(dblp_paper_isw, paper_handler);
//release
if (dblp_paper_in) dblp_paper_in.close();
if (dblp_paper_out) dblp_paper_out.close();
if (dblp_coauthor_in) dblp_coauthor_in.close();
if (dblp_coauthor_out) dblp_coauthor_out.close();
}
catch (const exception& e) {
cerr << "Error: " << e.what() << endl;
return -1;
}
//release
if (dblp_paper) dblp_paper.close();
if (dblp_coauthor) dblp_coauthor.close();
if (dblp_paper_out) dblp_paper_out.close();
if (dblp_coauthor_out) dblp_coauthor_out.close();
return 0;
}
\ No newline at end of file
......