Authored by 신은섭(Shin Eun Seop) on 2018-05-25 19:33:01 +0900
Commit dd04a0b37a1e763f0c35641e64a0391e1dee74e0 (short: dd04a0b3), 1 parent 68f248cd

add average ad efficient field
closed Java-Cesco/Detecting_fraud_clicks#3
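
As the diff to src/main/java/AvgAdvTime.java below shows, the new avg_efficient column is a running attribution rate per (ip, app) pair:

    avg_efficient = cum_sum_attributed / cum_count_click

where both cumulative columns are computed over a window partitioned by (ip, app), ordered by utc_click_time, and spanning the first click up to the current row.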
Showing 3 changed files with 28 additions and 15 deletions:
.idea/Detecting_fraud_clicks.iml
.idea/misc.xml
src/main/java/AvgAdvTime.java
.idea/Detecting_fraud_clicks.iml
@@ -12,6 +12,8 @@
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
    <orderEntry type="library" name="Maven: com.databricks:spark-csv_2.11:1.5.0" level="project" />
    <orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.1" level="project" />
    <orderEntry type="library" name="Maven: org.apache.spark:spark-core_2.11:2.3.0" level="project" />
    <orderEntry type="library" name="Maven: org.apache.avro:avro:1.7.7" level="project" />
    <orderEntry type="library" name="Maven: org.codehaus.jackson:jackson-core-asl:1.9.13" level="project" />
.idea/misc.xml
@@ -3,6 +3,13 @@
  <component name="JavaScriptSettings">
    <option name="languageLevel" value="ES6" />
  </component>
  <component name="MavenProjectsManager">
    <option name="originalFiles">
      <list>
        <option value="$PROJECT_DIR$/pom.xml" />
      </list>
    </option>
  </component>
  <component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="false" project-jdk-name="1.8" project-jdk-type="JavaSDK">
    <output url="file:///tmp" />
  </component>
src/main/java/AvgAdvTime.java
import org.apache.commons.net.ntp.TimeStamp;
import org.apache.spark.Aggregator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.IntegerType;
import org.apache.spark.sql.types.LongType;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;

import java.io.Serializable;
import java.sql.Time;
import java.sql.Timestamp;

import static org.apache.spark.sql.functions.unix_timestamp;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.count;
import static org.apache.spark.sql.functions.sum;

public class AvgAdvTime {
@@ -29,11 +23,21 @@ public class AvgAdvTime {
                .option("inferSchema", "true")
                .option("header", "true")
                .load("train_sample.csv");
        df.printSchema();

        // cast timestamp to long
        Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long"));
        newdf = newdf.withColumn("utc_attributed_time", df.col("attributed_time").cast("long"));
        newdf.show();
        newdf = newdf.drop("click_time").drop("attributed_time");

        WindowSpec w = Window.partitionBy("ip", "app")
                .orderBy("utc_click_time")
                .rowsBetween(Window.unboundedPreceding(), Window.currentRow());

        newdf = newdf.withColumn("cum_count_click", count("utc_click_time").over(w));
        newdf = newdf.withColumn("cum_sum_attributed", sum("is_attributed").over(w));
        newdf = newdf.withColumn("avg_efficient", col("cum_sum_attributed").divide(col("cum_count_click")));

        newdf.where("ip == '5348' and app == '19'").show();
        newdf.printSchema();
    }
}
\ No newline at end of file
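
The hunk above starts mid-statement, so the leading .option(...) calls belong to a DataFrame read whose opening line is not shown in this diff. A minimal sketch of what that setup could look like, assuming a local SparkSession and the standard CSV reader; the class name, appName, master setting, and format("csv") call are illustrative assumptions, while the reader options and the train_sample.csv path are taken from the diff:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

// Hypothetical helper class, not part of the repository.
public class AvgAdvTimeSetupSketch {
    public static void main(String[] args) {
        // Local session for a quick check; the commit does not show how the real session is built.
        SparkSession spark = SparkSession.builder()
                .appName("AvgAdvTime")   // illustrative name
                .master("local[*]")      // assumption: run locally
                .getOrCreate();

        // "inferSchema"/"header" and train_sample.csv come from the diff above;
        // format("csv") is the standard DataFrameReader call such options attach to.
        Dataset<Row> df = spark.read()
                .format("csv")
                .option("inferSchema", "true")
                .option("header", "true")
                .load("train_sample.csv");

        df.printSchema();
        spark.stop();
    }
}

Running this sketch needs the Spark SQL artifact (e.g. spark-sql_2.11) on the classpath in addition to the spark-core entry listed in the .iml diff above.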