Toggle navigation
Toggle navigation
This project
Loading...
Sign in
신은섭(Shin Eun Seop)
/
Detecting_fraud_clicks
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
2
Merge Requests
0
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
tnt-ooo-tnt
2018-06-06 05:20:33 +0900
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
b8d8bcf044a100dd864eb2dbfb14c800aa4c0474
b8d8bcf0
1 parent
f5eca027
Java-Cesco/Detecting-fraud-clicks#13-Draft
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
206 additions
and
181 deletions
src/main/java/Aggregation.java
src/main/java/GUI.java
src/main/java/Aggregation.java
deleted
100644 → 0
View file @
f5eca02
import
org.apache.spark.api.java.function.MapFunction
;
import
org.apache.spark.sql.Dataset
;
import
org.apache.spark.sql.Encoders
;
import
org.apache.spark.sql.Row
;
import
org.apache.spark.sql.SparkSession
;
import
org.apache.spark.sql.expressions.Window
;
import
org.apache.spark.sql.expressions.WindowSpec
;
import
java.util.ArrayList
;
import
java.util.List
;
import
static
org
.
apache
.
spark
.
sql
.
functions
.*;
import
static
org
.
apache
.
spark
.
sql
.
functions
.
lit
;
import
static
org
.
apache
.
spark
.
sql
.
functions
.
when
;
public
class
Aggregation
{
public
static
void
main
(
String
[]
args
)
throws
Exception
{
//Create Session
SparkSession
spark
=
SparkSession
.
builder
()
.
appName
(
"Detecting Fraud Clicks"
)
.
master
(
"local"
)
.
getOrCreate
();
// Aggregation
Aggregation
agg
=
new
Aggregation
();
Dataset
<
Row
>
dataset
=
agg
.
loadCSVDataSet
(
"/home/chris/.kaggle/competitions/talkingdata-adtracking-fraud-detection/mnt/ssd/kaggle-talkingdata2/competition_files/train_sample.csv"
,
spark
);
dataset
=
agg
.
changeTimestempToLong
(
dataset
);
dataset
=
agg
.
averageValidClickCount
(
dataset
);
dataset
=
agg
.
clickTimeDelta
(
dataset
);
dataset
=
agg
.
countClickInTenMinutes
(
dataset
);
long
start
=
System
.
currentTimeMillis
();
List
<
String
>
logs_with_features
=
dataset
.
map
(
row
->
row
.
toString
(),
Encoders
.
STRING
()).
collectAsList
();
String
[][]
contents
=
new
String
[(
int
)
dataset
.
count
()][
11
];
for
(
int
i
=
0
;
i
<
logs_with_features
.
size
();
i
++){
String
str_to_split
=
logs_with_features
.
get
(
i
);
String
[]
tmp
=
str_to_split
.
substring
(
1
,
str_to_split
.
length
()-
1
).
split
(
","
);
contents
[
i
]
=
tmp
;
}
long
end
=
System
.
currentTimeMillis
();
System
.
out
.
println
(
"JK's Procedure time elapsed : "
+
(
end
-
start
)/
1000.0
);
start
=
System
.
currentTimeMillis
();
List
<
String
>
stringDataset
=
dataset
.
toJSON
().
collectAsList
();
end
=
System
.
currentTimeMillis
();
System
.
out
.
println
(
"Steve's Procedure 1 time elapsed : "
+
(
end
-
start
)/
1000.0
);
new
GUI
(
stringDataset
,
contents
);
}
private
Dataset
<
Row
>
loadCSVDataSet
(
String
path
,
SparkSession
spark
){
// Read SCV to DataSet
return
spark
.
read
().
format
(
"csv"
)
.
option
(
"inferSchema"
,
"true"
)
.
option
(
"header"
,
"true"
)
.
load
(
path
);
}
private
Dataset
<
Row
>
changeTimestempToLong
(
Dataset
<
Row
>
dataset
){
// cast timestamp to long
Dataset
<
Row
>
newDF
=
dataset
.
withColumn
(
"utc_click_time"
,
dataset
.
col
(
"click_time"
).
cast
(
"long"
));
newDF
=
newDF
.
withColumn
(
"utc_attributed_time"
,
dataset
.
col
(
"attributed_time"
).
cast
(
"long"
));
newDF
=
newDF
.
drop
(
"click_time"
).
drop
(
"attributed_time"
);
return
newDF
;
}
private
Dataset
<
Row
>
averageValidClickCount
(
Dataset
<
Row
>
dataset
){
// set Window partition by 'ip' and 'app' order by 'utc_click_time' select rows between 1st row to current row
WindowSpec
w
=
Window
.
partitionBy
(
"ip"
,
"app"
)
.
orderBy
(
"utc_click_time"
)
.
rowsBetween
(
Window
.
unboundedPreceding
(),
Window
.
currentRow
());
// aggregation
Dataset
<
Row
>
newDF
=
dataset
.
withColumn
(
"cum_count_click"
,
count
(
"utc_click_time"
).
over
(
w
));
newDF
=
newDF
.
withColumn
(
"cum_sum_attributed"
,
sum
(
"is_attributed"
).
over
(
w
));
newDF
=
newDF
.
withColumn
(
"avg_valid_click_count"
,
col
(
"cum_sum_attributed"
).
divide
(
col
(
"cum_count_click"
)));
newDF
=
newDF
.
drop
(
"cum_count_click"
,
"cum_sum_attributed"
);
return
newDF
;
}
private
Dataset
<
Row
>
clickTimeDelta
(
Dataset
<
Row
>
dataset
){
WindowSpec
w
=
Window
.
partitionBy
(
"ip"
)
.
orderBy
(
"utc_click_time"
);
Dataset
<
Row
>
newDF
=
dataset
.
withColumn
(
"lag(utc_click_time)"
,
lag
(
"utc_click_time"
,
1
).
over
(
w
));
newDF
=
newDF
.
withColumn
(
"click_time_delta"
,
when
(
col
(
"lag(utc_click_time)"
).
isNull
(),
lit
(
0
)).
otherwise
(
col
(
"utc_click_time"
)).
minus
(
when
(
col
(
"lag(utc_click_time)"
).
isNull
(),
lit
(
0
)).
otherwise
(
col
(
"lag(utc_click_time)"
))));
newDF
=
newDF
.
drop
(
"lag(utc_click_time)"
);
return
newDF
;
}
private
Dataset
<
Row
>
countClickInTenMinutes
(
Dataset
<
Row
>
dataset
){
WindowSpec
w
=
Window
.
partitionBy
(
"ip"
)
.
orderBy
(
"utc_click_time"
)
.
rangeBetween
(
Window
.
currentRow
(),
Window
.
currentRow
()+
600
);
Dataset
<
Row
>
newDF
=
dataset
.
withColumn
(
"count_click_in_ten_mins"
,
(
count
(
"utc_click_time"
).
over
(
w
)).
minus
(
1
));
//TODO 본인것 포함할 것인지 정해야함.
return
newDF
;
}
}
\ No newline at end of file
src/main/java/GUI.java
View file @
b8d8bcf
...
...
@@ -4,22 +4,15 @@ import org.apache.spark.sql.Row;
import
javax.swing.*
;
import
java.awt.*
;
import
java.io.BufferedReader
;
import
java.io.File
;
import
java.io.StringReader
;
import
java.sql.ResultSet
;
import
java.sql.ResultSetMetaData
;
import
java.sql.Statement
;
import
java.util.List
;
import
java.util.Vector
;
import
java.awt.BorderLayout
;
import
java.awt.GridLayout
;
import
java.awt.event.ActionEvent
;
import
java.awt.event.ActionListener
;
import
java.sql.Connection
;
import
java.sql.DriverManager
;
import
java.sql.ResultSet
;
import
java.sql.ResultSetMetaData
;
import
java.sql.Statement
;
import
java.util.Vector
;
import
javax.swing.JButton
;
import
javax.swing.JFrame
;
...
...
@@ -28,31 +21,35 @@ import javax.swing.JPanel;
import
javax.swing.JScrollPane
;
import
javax.swing.JTable
;
import
javax.swing.JTextField
;
import
javax.swing.
table.AbstractTableModel
;
import
javax.swing.
filechooser.FileFilter
;
import
javax.swing.table.DefaultTableModel
;
import
org.apache.spark.sql.SparkSession
;
import
org.apache.spark.sql.expressions.Window
;
import
org.apache.spark.sql.expressions.WindowSpec
;
import
static
org
.
apache
.
spark
.
sql
.
functions
.*;
import
static
org
.
apache
.
spark
.
sql
.
functions
.
lit
;
import
static
org
.
apache
.
spark
.
sql
.
functions
.
when
;
public
class
GUI
extends
JFrame
{
JTabbedPane
tab
=
new
JTabbedPane
();
public
GUI
(
List
<
String
>
q
,
String
[][]
data
)
{
public
GUI
()
{
super
(
"CESCO"
);
tab
.
addTab
(
"png"
,
new
PngPane
());
tab
.
addTab
(
"gif"
,
new
GifPane
());
tab
.
addTab
(
"jpg"
,
new
JpgPane
());
tab
.
addTab
(
"table"
,
new
createTable
(
q
));
tab
.
addTab
(
"processed_features"
,
new
createTable_alter
(
data
));
tab
.
addTab
(
"main"
,
new
CreateTable_tab
());
tab
.
addTab
(
"graphics"
,
new
PngPane
());
add
(
tab
);
setSize
(
800
,
500
);
// window size: width x height
setSize
(
1280
,
1024
);
// window size: width x height
setVisible
(
true
);
// true to show the window, false to hide it
setDefaultCloseOperation
(
JFrame
.
EXIT_ON_CLOSE
);
// exit the application when the close (X) button is pressed
}
// public static void main(String args[]) {
// new GUI();
// }
public
static
void
main
(
String
args
[])
{
new
GUI
();
}
}
class
PngPane
extends
JPanel
{
...
...
@@ -65,61 +62,147 @@ class PngPane extends JPanel {
}
}
/**
 * Panel that displays the image loaded from {@code data/model.gif},
 * centred inside a {@link BorderLayout}.
 */
class GifPane extends JPanel {

    /** Creates the panel and places the image label in its centre region. */
    public GifPane() {
        super(new BorderLayout());
        final JLabel picture = new JLabel("", new ImageIcon("data/model.gif"), SwingConstants.CENTER);
        add(picture, BorderLayout.CENTER);
    }
}
class
CreateTable_tab
extends
JPanel
{
public
JPanel
centre_pane
=
new
JPanel
();
public
JPanel
south_pane
=
new
JPanel
();
public
JScrollPane
pan1
=
new
JScrollPane
();
public
JTable
table1
=
new
JTable
();
public
JButton
btn1
=
new
JButton
(
"CONFIRM"
);
public
JScrollPane
pan2
=
new
JScrollPane
();
public
JTable
table2
=
new
JTable
();
class
JpgPane
extends
JPanel
{
public
JpgPane
()
{
public
JScrollPane
pan3
=
new
JScrollPane
();
public
JTable
table3
=
new
JTable
();
private
DefaultTableModel
tableModel1
=
new
DefaultTableModel
(
new
Object
[]{
"unknown"
},
1
);
private
DefaultTableModel
tableModel2
=
new
DefaultTableModel
(
new
Object
[]{
"unknown"
},
1
);
private
DefaultTableModel
tableModel3
=
new
DefaultTableModel
(
new
Object
[]{
"unknown"
},
1
);
public
CsvFile_chooser
temp
=
new
CsvFile_chooser
();
public
CreateTable_tab
(){
super
();
ImageIcon
image
=
new
ImageIcon
(
"data/model.jpg"
);
JLabel
label
=
new
JLabel
(
""
,
image
,
JLabel
.
CENTER
);
setLayout
(
new
BorderLayout
());
add
(
label
,
BorderLayout
.
CENTER
);
//csvFile_chooser
add
(
temp
,
BorderLayout
.
NORTH
);
// sub Panel 1
centre_pane
.
setLayout
(
new
GridLayout
(
1
,
3
));
pan1
.
setViewportView
(
table1
);
centre_pane
.
add
(
pan1
);
// sub Panel 2
pan2
.
setViewportView
(
table2
);
centre_pane
.
add
(
pan2
);
// sub Panel 3
pan3
.
setViewportView
(
table3
);
centre_pane
.
add
(
pan3
);
//sub Panel 4
south_pane
.
setLayout
(
new
FlowLayout
());
south_pane
.
add
(
btn1
);
btn1
.
addActionListener
(
new
ActionListener
()
{
@Override
public
void
actionPerformed
(
ActionEvent
e
)
{
if
(
temp
.
is_selected
)
{
String
path
=
temp
.
selected_file
.
getAbsolutePath
();
// 1st Column Raw Data
SparkSession
spark
=
SparkSession
.
builder
()
.
appName
(
"Detecting Fraud Clicks"
)
.
master
(
"local"
)
.
getOrCreate
();
// Aggregation
Aggregation
agg
=
new
Aggregation
();
// Raw data
TableCreator
table_maker
=
new
TableCreator
();
Dataset
<
Row
>
dataset
=
agg
.
loadCSVDataSet
(
path
,
spark
);
List
<
String
>
stringDataset_Raw
=
dataset
.
toJSON
().
collectAsList
();
String
[]
header_r
=
{
"ip"
,
"app"
,
"device"
,
"os"
,
"channel"
,
"click_time"
,
"is_attributed"
};
table1
.
setModel
(
table_maker
.
getTableModel
(
stringDataset_Raw
,
header_r
));
// 2nd Column Data with features
// Adding features
dataset
=
agg
.
changeTimestempToLong
(
dataset
);
dataset
=
agg
.
averageValidClickCount
(
dataset
);
dataset
=
agg
.
clickTimeDelta
(
dataset
);
dataset
=
agg
.
countClickInTenMinutes
(
dataset
);
List
<
String
>
stringDataset_feat
=
dataset
.
toJSON
().
collectAsList
();
String
[]
header_f
=
{
"ip"
,
"app"
,
"device"
,
"os"
,
"channel"
,
"is_attributed"
,
"click_time"
,
"avg_valid_click_count"
,
"click_time_delta"
,
"count_click_in_ten_mins"
};
table2
.
setModel
(
table_maker
.
getTableModel
(
stringDataset_feat
,
header_f
));
// 3rd Column Final results
}
}
});
add
(
centre_pane
,
BorderLayout
.
CENTER
);
add
(
south_pane
,
BorderLayout
.
SOUTH
);
}
}
class
createTable_alter
extends
JPanel
{
private
String
[]
header
=
{
"ip"
,
"app"
,
"device"
,
"os"
,
"channel"
,
"is_attributed"
,
"click_time"
,
"attributed_time"
,
"avg_valid_click_count"
,
"click_time_delta"
,
"count_click_in_tenmin"
};
/*
root
|-- ip: integer (nullable = true)
|-- app: integer (nullable = true)
|-- device: integer (nullable = true)
|-- os: integer (nullable = true)
|-- channel: integer (nullable = true)
|-- is_attributed: integer (nullable = true)
|-- utc_click_time: long (nullable = true)
|-- utc_attributed_time: long (nullable = true)
|-- avg_valid_click_count: double (nullable = true)
|-- click_time_delta: long (nullable = true)
|-- count_click_in_ten_mins: long (nullable = false)
*/
public
createTable_alter
(
String
[][]
data
){
JTable
processed_table
=
new
JTable
(
data
,
header
);
JScrollPane
jScrollPane
=
new
JScrollPane
(
processed_table
);
add
(
jScrollPane
);
/**
 * Row consisting of a text field and a "..." button that opens a file dialog.
 * The approved file is published through {@link #selected_file} and
 * {@link #is_selected}, and its absolute path is shown in the text field.
 */
class CsvFile_chooser extends JPanel {
    private final JFileChooser chooser = new JFileChooser();
    private final JTextField path_field = new JTextField(30);
    private final JButton browser = new JButton("...");

    /** File most recently approved in the dialog; {@code null} until one is chosen. */
    public File selected_file;
    /** Becomes {@code true} once a file has been approved in the dialog. */
    boolean is_selected = false;

    /** Lays out the text field and button and wires up the file dialog. */
    public CsvFile_chooser() {
        setLayout(new FlowLayout());

        // Offer a filter that shows directories (so they stay navigable) and *.csv files.
        chooser.addChoosableFileFilter(new FileFilter() {
            @Override
            public boolean accept(File f) {
                return f.isDirectory() || f.getName().toLowerCase().endsWith(".csv");
            }

            @Override
            public String getDescription() {
                return "CSV files (*.csv)";
            }
        });

        add(path_field);
        add(browser);

        // This listener is registered on the browse button only, so the event source
        // does not need to be checked.
        browser.addActionListener(e -> {
            // Parent the dialog to this panel so it opens over the application window
            // instead of being detached from it.
            if (chooser.showOpenDialog(this) != JFileChooser.APPROVE_OPTION) {
                return;
            }
            selected_file = chooser.getSelectedFile();
            path_field.setText(selected_file.getAbsolutePath());
            is_selected = true;
        });
    }
}
class
createTable
extends
JPanel
{
long
start
=
System
.
currentTimeMillis
();
public
createTable
(
List
<
String
>
data
)
{
//constructor : display table
getTableModel
(
data
);
}
class
TableCreator
extends
JPanel
{
public
DefaultTableModel
model
;
private
DefaultTableModel
getTableModel
(
List
<
String
>
data
)
{
String
column_n
[]={
"ip"
,
"app"
,
"device"
,
"os"
,
"channel"
,
"is_attributed"
,
"click_time"
,
"avg_valid_click_count"
,
"click_time_delta"
,
"count_click_in_tenmin"
};
public
DefaultTableModel
getTableModel
(
List
<
String
>
data
,
String
[]
header
)
{
Object
tabledata
[][]={};
DefaultTableModel
model
=
new
DefaultTableModel
(
tabledata
,
column_n
);
DefaultTableModel
model
=
new
DefaultTableModel
(
tabledata
,
header
);
JTable
jtable
=
new
JTable
(
model
);
JScrollPane
jScollPane
=
new
JScrollPane
(
jtable
);
add
(
jScollPane
);
...
...
@@ -131,11 +214,9 @@ class createTable extends JPanel {
line
=
line
.
replace
(
"\""
,
""
);
line
=
line
.
replace
(
"_"
,
""
);
//line = line.replace("\\{","");
line
=
line
.
replaceAll
(
"\\{|\\}"
,
""
);
line
=
line
.
replaceAll
(
"\\w+:"
,
""
);
//System.out.println(line);
Object
[]
temp
=
line
.
split
(
","
);
model
.
addRow
(
temp
);
...
...
@@ -146,8 +227,6 @@ class createTable extends JPanel {
}
catch
(
Exception
e
)
{
System
.
out
.
println
(
e
);
}
long
end
=
System
.
currentTimeMillis
();
System
.
out
.
println
(
"Steve's Procedure2 time elapsed : "
+
(
end
-
start
)/
1000.0
);
return
model
;
}
...
...
@@ -162,4 +241,60 @@ class createTable extends JPanel {
return
reader
;
}
}
/**
 * Spark feature-engineering steps that the GUI applies to the click log
 * before showing it in its tables.
 */
class Aggregation {

    private static final String IP = "ip";
    private static final String CLICK_TIME = "utc_click_time";
    private static final String PREVIOUS_CLICK = "lag(utc_click_time)";

    /**
     * Reads a CSV file that has a header row, letting Spark infer the column types.
     *
     * @param path  location of the CSV file
     * @param spark session used for reading
     * @return the loaded rows
     */
    public Dataset<Row> loadCSVDataSet(String path, SparkSession spark) {
        // Read CSV to DataSet
        return spark.read()
                .format("csv")
                .option("inferSchema", "true")
                .option("header", "true")
                .load(path);
    }

    /**
     * Replaces {@code click_time} and {@code attributed_time} with
     * {@code utc_click_time} and {@code utc_attributed_time}, each cast to long.
     *
     * @param dataset rows containing the two time columns
     * @return the rows with the original time columns dropped
     */
    public Dataset<Row> changeTimestempToLong(Dataset<Row> dataset) {
        return dataset
                .withColumn(CLICK_TIME, dataset.col("click_time").cast("long"))
                .withColumn("utc_attributed_time", dataset.col("attributed_time").cast("long"))
                .drop("click_time")
                .drop("attributed_time");
    }

    /**
     * Adds {@code avg_valid_click_count}: the running sum of {@code is_attributed}
     * divided by the running click count, per (ip, app) in click-time order.
     *
     * @param dataset rows with {@code ip}, {@code app}, {@code is_attributed} and {@code utc_click_time}
     * @return the rows with the new column; the intermediate running columns are dropped
     */
    public Dataset<Row> averageValidClickCount(Dataset<Row> dataset) {
        // Frame: from the first row of the (ip, app) partition up to the current row.
        WindowSpec runningFrame = Window.partitionBy(IP, "app")
                .orderBy(CLICK_TIME)
                .rowsBetween(Window.unboundedPreceding(), Window.currentRow());

        return dataset
                .withColumn("cum_count_click", count(CLICK_TIME).over(runningFrame))
                .withColumn("cum_sum_attributed", sum("is_attributed").over(runningFrame))
                .withColumn("avg_valid_click_count", col("cum_sum_attributed").divide(col("cum_count_click")))
                .drop("cum_count_click", "cum_sum_attributed");
    }

    /**
     * Adds {@code click_time_delta}: the difference between a click's time and the
     * previous click's time for the same ip, or 0 when there is no previous click.
     *
     * @param dataset rows with {@code ip} and {@code utc_click_time}
     * @return the rows with the new column; the helper lag column is dropped
     */
    public Dataset<Row> clickTimeDelta(Dataset<Row> dataset) {
        WindowSpec perIp = Window.partitionBy(IP).orderBy(CLICK_TIME);

        // Both operands collapse to 0 when there is no previous click, so the delta is 0.
        return dataset
                .withColumn(PREVIOUS_CLICK, lag(CLICK_TIME, 1).over(perIp))
                .withColumn("click_time_delta",
                        when(col(PREVIOUS_CLICK).isNull(), lit(0)).otherwise(col(CLICK_TIME))
                                .minus(when(col(PREVIOUS_CLICK).isNull(), lit(0)).otherwise(col(PREVIOUS_CLICK))))
                .drop(PREVIOUS_CLICK);
    }

    /**
     * Adds {@code count_click_in_ten_mins}: the number of rows of the same ip whose
     * {@code utc_click_time} lies in the range from the current value up to 600
     * beyond it, minus one.
     *
     * @param dataset rows with {@code ip} and {@code utc_click_time}
     * @return the rows with the new column
     */
    public Dataset<Row> countClickInTenMinutes(Dataset<Row> dataset) {
        // 600 is presumably seconds (ten minutes) - TODO confirm the unit of utc_click_time.
        WindowSpec lookAhead = Window.partitionBy(IP)
                .orderBy(CLICK_TIME)
                .rangeBetween(Window.currentRow(), Window.currentRow() + 600);

        // TODO: decide whether the current click itself should be included in the count.
        return dataset.withColumn("count_click_in_ten_mins", (count(CLICK_TIME).over(lookAhead)).minus(1));
    }
}
\ No newline at end of file
...
...
Please
register
or
login
to post a comment