Toggle navigation
Toggle navigation
This project
Loading...
Sign in
조성현
/
csh-parser
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
조성현
2017-03-18 18:08:28 +0900
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
1108d0da2e8cc766c3452ea636202bf8cedc83c8
1108d0da
1 parent
7f25cd60
changed to RapidJson SAX ver
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
160 additions
and
83 deletions
dblpJsonParser/dblpJsonParser.vcxproj
dblpJsonParser/main.cpp
dblpJsonParser/dblpJsonParser.vcxproj
View file @
1108d0d
...
...
@@ -71,19 +71,23 @@
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
<IncludePath>C:\boost\boost_1_62_0;C:\
JsonCpp\jsoncpp-master
\include;$(IncludePath)</IncludePath>
<LibraryPath>C:\boost\boost_1_62_0\stage\lib;
C:\JsonCpp\jsoncpp-master\build\vs71\debug\lib_json;
$(LibraryPath)</LibraryPath>
<IncludePath>C:\boost\boost_1_62_0;C:\
rapidjson
\include;$(IncludePath)</IncludePath>
<LibraryPath>C:\boost\boost_1_62_0\stage\lib;$(LibraryPath)</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<IncludePath>C:\boost\boost_1_62_0;C:\
JsonCpp\jsoncpp-master
\include;$(IncludePath)</IncludePath>
<LibraryPath>C:\boost\boost_1_62_0\stage\lib;
C:\JsonCpp\jsoncpp-master\build\vs71\debug\lib_json;
$(LibraryPath)</LibraryPath>
<IncludePath>C:\boost\boost_1_62_0;C:\
rapidjson
\include;$(IncludePath)</IncludePath>
<LibraryPath>C:\boost\boost_1_62_0\stage\lib;$(LibraryPath)</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
<IncludePath>C:\rapidjson\include;C:\boost\boost_1_62_0;$(IncludePath)</IncludePath>
<LibraryPath>C:\boost\boost_1_62_0\stage\lib;$(LibraryPath)</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<IncludePath>C:\rapidjson\include;C:\boost\boost_1_62_0;$(IncludePath)</IncludePath>
<LibraryPath>C:\boost\boost_1_62_0\stage\lib;$(LibraryPath)</LibraryPath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
...
...
@@ -99,7 +103,7 @@
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>
json_vc71_libmtd.lib;
%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
...
...
@@ -116,7 +120,7 @@
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>
json_vc71_libmtd.lib;
%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
...
...
dblpJsonParser/main.cpp
View file @
1108d0d
#include <cstdio>
#include <cinttypes> //PRId64
#include <exception>
#include <fstream>
#include <iostream>
...
...
@@ -7,7 +8,9 @@
#include <boost/regex.hpp>
#include <json/json.h>
#include <rapidjson/document.h>
#include <rapidjson/istreamwrapper.h>
#include <rapidjson/reader.h>
using
namespace
std
;
...
...
@@ -15,105 +18,175 @@ const char* DBLP_FILENAME = "dblp.json";
// File name of the co-authorship dataset.
// NOTE(review): in the visible code this is only referenced from a commented-out
// open() call — confirm whether it is still needed.
const
char
*
DBLP_COAUTHOR_FILENAME
=
"tmp_dblp_coauthorship.json"
;
// Separator written between the key, author-list and year columns of an output line.
const
string
COLUMN_DELIMITER
=
"||"
;
// Separator written between consecutive author names inside the author-list column.
const
string
AUTHOR_DELIMITER
=
"&&"
;
// Matches strings that begin with "conf" or "journals"; a successful regex_match
// against it is what marks a record as a paper.
boost
::
regex
paper_reg
{
"(conf|journals).*"
};
int
main
()
{
//init
Json
::
Value
root
;
Json
::
Reader
reader
;
ifstream
dblp_paper
,
dblp_coauthor
;
ofstream
dblp_paper_out
,
dblp_coauthor_out
;
boost
::
regex
paper_reg
{
"(conf|journals).*"
};
try
{
//1. dblp paper dataset
dblp_paper
.
open
(
DBLP_FILENAME
);
dblp_paper_out
.
open
((
string
(
DBLP_FILENAME
)
+
string
(
".out"
)).
c_str
());
if
(
!
dblp_paper
||
!
dblp_paper_out
)
{
throw
exception
(
"dblp paper file error"
);
}
printf
(
"* complete open
\n
"
);
struct
PaperRecord
{
string
paper_key
;
vector
<
string
>
authors
;
unsigned
int
year
;
if
(
!
reader
.
parse
(
dblp_paper
,
root
))
{
throw
exception
(
"parse error"
);
void
write
(
ofstream
&
fout
)
{
fout
<<
paper_key
<<
COLUMN_DELIMITER
;
for
(
auto
it
=
authors
.
begin
();
it
!=
authors
.
end
();
++
it
)
{
if
(
it
!=
authors
.
begin
())
{
fout
<<
AUTHOR_DELIMITER
;
}
fout
<<
*
it
;
}
printf
(
"* complete parse
\n
"
);
fout
<<
COLUMN_DELIMITER
<<
year
<<
endl
;
}
void
clear
()
{
paper_key
.
clear
();
authors
.
clear
();
year
=
1
;
}
void
add_author
(
string
str
)
{
authors
.
push_back
(
str
);
}
};
struct
DblpPaperHandler
{
bool
whole_array
=
false
;
bool
is_record
=
false
;
bool
is_authors
=
false
;
bool
is_paper
=
false
;
uint64_t
record_count
=
0
;
Json
::
Value
row
;
Json
::
Value
coauthor
s
;
PaperRecord
paper
;
ofstream
&
of
s
;
std
::
string
paper_key
;
std
::
vector
<
string
>
coauthor_list
;
int
year
;
DblpPaperHandler
(
ofstream
&
fout
)
:
ofs
(
fout
)
{
}
int
count
=
1
;
for
(
auto
it
=
root
.
begin
();
it
!=
root
.
end
();
++
it
)
{
//
// SAX callback for a JSON null value.
// This handler stores nothing for nulls; the event is simply accepted.
// Returns true unconditionally.
bool Null()
{
    return true;
}
// SAX callback for a JSON boolean.
// The value `b` is not recorded anywhere; the event is accepted and ignored.
// Returns true unconditionally.
bool Bool(bool b)
{
    return true;
}
// SAX callback for a signed integer value.
// The value `i` is not recorded anywhere; the event is accepted and ignored.
// Returns true unconditionally.
bool Int(int i)
{
    return true;
}
bool
Uint
(
unsigned
u
)
{
//cout << "Uint(" << u << ")" << endl;
if
(
is_paper
)
{
paper
.
year
=
u
;
}
//전처리 (pre-processing)
row
.
clear
();
coauthors
.
clear
();
coauthor_list
.
clear
();
return
true
;
}
// SAX callback for a signed 64-bit integer value.
// The value `i` is not recorded anywhere; the event is accepted and ignored.
// Returns true unconditionally.
bool Int64(int64_t i)
{
    return true;
}
// SAX callback for an unsigned 64-bit integer value.
// The value `u` is not recorded anywhere; the event is accepted and ignored.
// Returns true unconditionally.
bool Uint64(uint64_t u)
{
    return true;
}
// SAX callback for a floating-point value.
// The value `d` is not recorded anywhere; the event is accepted and ignored.
// Returns true unconditionally.
bool Double(double d)
{
    return true;
}
// SAX callback for a number handed over as a character buffer (`str`, `length`).
// None of the arguments are used; the event is accepted and ignored.
// Returns true unconditionally.
bool RawNumber(const char* str, rapidjson::SizeType length, bool copy)
{
    return true;
}
bool
String
(
const
char
*
str
,
rapidjson
::
SizeType
length
,
bool
copy
)
{
if
(
is_record
)
{
if
(
is_authors
)
{
if
(
!
is_paper
)
return
true
;
//print
if
(
count
%
1000000
==
0
)
{
printf
(
"* [%d]
\n
"
,
count
);
paper
.
add_author
(
string
(
str
));
}
else
{
if
(
boost
::
regex_match
(
str
,
paper_reg
))
{
is_paper
=
true
;
paper
.
paper_key
=
string
(
str
);
}
}
//row 단위로 read (read one row at a time)
row
=
*
it
;
paper_key
=
row
[
0
].
asString
();
//check whether it is paper
if
(
boost
::
regex_match
(
paper_key
,
paper_reg
))
{
coauthors
=
row
[
1
];
for
(
auto
coit
=
coauthors
.
begin
();
coit
!=
coauthors
.
end
();
++
coit
)
{
coauthor_list
.
push_back
(
coit
->
asString
());
}
return
true
;
}
// SAX callback fired at the start of a JSON object.
// No state is changed for objects; the event is accepted and ignored.
// Returns true unconditionally.
bool StartObject()
{
    return true;
}
// SAX callback for an object member name (`str`, `length`).
// None of the arguments are used; the event is accepted and ignored.
// Returns true unconditionally.
bool Key(const char* str, rapidjson::SizeType length, bool copy)
{
    return true;
}
// SAX callback fired at the end of a JSON object.
// `memberCount` is not used; the event is accepted and ignored.
// Returns true unconditionally.
bool EndObject(rapidjson::SizeType memberCount)
{
    return true;
}
// SAX callback fired at the start of a JSON array.
// Sets the first still-unset flag in the order whole_array -> is_record -> is_authors,
// so each call marks at most one flag. When all three are already set the call
// changes nothing.
// Returns true unconditionally.
bool StartArray()
{
    if (!whole_array)
    {
        whole_array = true;
        return true;
    }
    if (!is_record)
    {
        is_record = true;
        return true;
    }
    if (!is_authors)
    {
        is_authors = true;
    }
    return true;
}
bool
EndArray
(
rapidjson
::
SizeType
elementCount
)
{
if
(
is_record
)
{
if
(
is_authors
)
{
is_authors
=
false
;
}
else
{
if
(
is_paper
)
{
paper
.
write
(
ofs
);
paper
.
clear
();
}
year
=
((
row
[
2
].
isNull
())
?-
1
:
row
[
2
].
asInt
());
//write
dblp_paper_out
<<
paper_key
<<
COLUMN_DELIMITER
;
if
(
coauthor_list
.
size
()
>
0
)
{
for
(
auto
auit
=
coauthor_list
.
begin
();
auit
!=
coauthor_list
.
end
();)
{
dblp_paper_out
<<
(
*
auit
);
++
auit
;
if
(
auit
!=
coauthor_list
.
end
())
{
dblp_paper_out
<<
AUTHOR_DELIMITER
;
}
}
}
else
{
//empty
throw
exception
(
"paper without author"
);
is_record
=
false
;
is_paper
=
false
;
++
record_count
;
if
(
record_count
%
100000
==
0
)
{
printf
(
"* [%"
PRIu64
"]
\n
"
,
record_count
);
}
dblp_paper_out
<<
COLUMN_DELIMITER
<<
year
<<
endl
;
}
else
{
//not paper
}
}
else
{
whole_array
=
false
;
printf
(
"* total paper record: [%"
PRIu64
"]
\n
"
,
record_count
);
}
return
true
;
}
};
//후처리 (post-processing)
++
count
;
int
main
()
{
try
{
ifstream
dblp_paper_in
,
dblp_coauthor_in
;
ofstream
dblp_paper_out
,
dblp_coauthor_out
;
dblp_paper_in
.
open
(
DBLP_FILENAME
);
dblp_paper_out
.
open
((
string
(
DBLP_FILENAME
)
+
string
(
".out"
)).
c_str
());
if
(
!
dblp_paper_in
||
!
dblp_paper_out
)
{
throw
exception
(
"dblp paper file"
);
}
printf
(
"* complete convert dblp paper
\n
"
);
DblpPaperHandler
paper_handler
(
dblp_paper_out
);
//2. dblp coauthorship dataset
//dblp_coauthor.open(DBLP_COAUTHOR_FILENAME)
;
rapidjson
::
IStreamWrapper
dblp_paper_isw
(
dblp_paper_in
);
rapidjson
::
Reader
reader
;
reader
.
Parse
(
dblp_paper_isw
,
paper_handler
);
//release
if
(
dblp_paper_in
)
dblp_paper_in
.
close
();
if
(
dblp_paper_out
)
dblp_paper_out
.
close
();
if
(
dblp_coauthor_in
)
dblp_coauthor_in
.
close
();
if
(
dblp_coauthor_out
)
dblp_coauthor_out
.
close
();
}
catch
(
const
exception
&
e
)
{
cerr
<<
"Error: "
<<
e
.
what
()
<<
endl
;
return
-
1
;
}
//release
if
(
dblp_paper
)
dblp_paper
.
close
();
if
(
dblp_coauthor
)
dblp_coauthor
.
close
();
if
(
dblp_paper_out
)
dblp_paper_out
.
close
();
if
(
dblp_coauthor_out
)
dblp_coauthor_out
.
close
();
return
0
;
}
\ No newline at end of file
...
...
Please
register
or
login
to post a comment