Toggle navigation
Toggle navigation
This project
Loading...
Sign in
신은섭(Shin Eun Seop)
/
Detecting_fraud_clicks
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
2
Merge Requests
0
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
hyungyun.Moon
2018-05-28 16:00:59 +0900
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
dcc8c3717051f78c20ad0f96d876f801be08714d
dcc8c371
1 parent
efbc91aa
change lead to count
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
14 additions
and
3 deletions
data/train_.csv
src/main/java/CountTen.java
data/train_.csv
0 → 100644
View file @
dcc8c37
ip,app,device,os,channel,click_time,attributed_time,is_attributed
117898,12,1,13,497,2017-11-07 09:30:38,,0
117898,12,1,13,497,2017-11-07 09:30:38,,0
117898,12,1,13,497,2017-11-07 09:31:38,,0
117898,12,1,13,497,2017-11-07 09:31:38,,0
117898,12,1,13,497,2017-11-07 09:31:38,,0
117898,12,1,13,497,2017-11-07 09:39:38,,0
117898,12,1,13,497,2017-11-07 09:40:38,,0
\ No newline at end of file
src/main/java/CountTen.java
View file @
dcc8c37
...
...
@@ -20,7 +20,7 @@ public class CountTen {
Dataset
<
Row
>
df
=
spark
.
read
().
format
(
"csv"
)
.
option
(
"inferSchema"
,
"true"
)
.
option
(
"header"
,
"true"
)
.
load
(
"./data/train.csv"
);
.
load
(
"./data/train
_
.csv"
);
// cast timestamp to long
Dataset
<
Row
>
newdf
=
df
.
withColumn
(
"utc_click_time"
,
df
.
col
(
"click_time"
).
cast
(
"long"
));
...
...
@@ -28,11 +28,14 @@ public class CountTen {
newdf
=
newdf
.
drop
(
"click_time"
).
drop
(
"attributed_time"
);
WindowSpec
w
=
Window
.
partitionBy
(
"ip"
)
.
orderBy
(
"utc_click_time"
);
.
orderBy
(
"utc_click_time"
)
.
rangeBetween
(
Window
.
currentRow
(),
Window
.
currentRow
()+
600
);
// .rowsBetween(Window.currentRow(), Window.unboundedPreceding()); //Boundary end is not a valid integer: -9223372036854775808
newdf
=
newdf
.
withColumn
(
"is_clicked_in_ten_mins"
,
(
lead
(
col
(
"utc_click_time"
),
1
).
over
(
w
).
minus
(
col
(
"utc_click_time"
)).
lt
((
long
)
600
)).
cast
(
"long"
));
(
count
(
"utc_click_time"
).
over
(
w
)).
minus
(
1
));
// TODO: decide whether the count should include the current row itself.
// newdf = newdf.withColumn("is_clicked_in_ten_mins",
// (lead(col("utc_click_time"),1).over(w).minus(col("utc_click_time")).lt((long)600)).cast("long"));
newdf
.
where
(
"ip == '117898'"
).
show
(
false
);
}
...
...
Please
register
or
login
to post a comment