Committed by GitHub
Merge pull request #17 from Java-Cesco/feature/tenMinsHG
Feature/ten mins hg

Showing 6 changed files with 56 additions and 4 deletions
.idea/.name
0 → 100644
+Detecting_fraud_clicks
\ No newline at end of file
data/train.csv
0 → 100644
This diff could not be displayed because it is too large.
data/train_.csv
0 → 100644
+ip,app,device,os,channel,click_time,attributed_time,is_attributed
+117898,12,1,13,497,2017-11-07 09:30:38,,0
+117898,12,1,13,497,2017-11-07 09:30:38,,0
+117898,12,1,13,497,2017-11-07 09:31:38,,0
+117898,12,1,13,497,2017-11-07 09:31:38,,0
+117898,12,1,13,497,2017-11-07 09:31:38,,0
+117898,12,1,13,497,2017-11-07 09:39:38,,0
+117898,12,1,13,497,2017-11-07 09:40:38,,0
\ No newline at end of file
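For orientation, the sample columns are ip, app, device, os, channel, click_time, attributed_time and is_attributed. Below is a minimal sketch of the schema one would expect Spark to infer for this file; the explicit StructType and class name are illustrative only, since CountTen.java further down relies on inferSchema instead.

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// Hypothetical explicit schema for data/train_.csv; the PR code infers it instead.
public class TrainSchemaSketch {
    public static final StructType SCHEMA = new StructType()
            .add("ip", DataTypes.IntegerType)
            .add("app", DataTypes.IntegerType)
            .add("device", DataTypes.IntegerType)
            .add("os", DataTypes.IntegerType)
            .add("channel", DataTypes.IntegerType)
            .add("click_time", DataTypes.TimestampType)
            .add("attributed_time", DataTypes.TimestampType)
            .add("is_attributed", DataTypes.IntegerType);
}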
pom.xml
@@ -16,19 +16,21 @@
         <artifactId>spark-core_2.11</artifactId>
         <version>2.3.0</version>
     </dependency>
-
+    <dependency>
+        <groupId>org.apache.spark</groupId>
+        <artifactId>spark-sql_2.11</artifactId>
+        <version>2.2.0</version>
+    </dependency>
     <dependency>
         <groupId>org.apache.spark</groupId>
         <artifactId>spark-sql_2.11</artifactId>
         <version>2.3.0</version>
     </dependency>
-
     <dependency>
         <groupId>com.databricks</groupId>
         <artifactId>spark-csv_2.11</artifactId>
         <version>1.5.0</version>
     </dependency>
-
 </dependencies>

 <build>
@@ -45,5 +47,4 @@
     </plugins>
 </build>

-
 </project>
\ No newline at end of file
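A note on the dependency change: with spark-sql 2.x on the classpath the built-in csv data source is available, which is what CountTen.java below uses; the com.databricks:spark-csv artifact mainly matters if the legacy format name is used. A minimal sketch under that assumption (class name, options and the comparison itself are illustrative, not part of the PR):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CsvReadSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local")
                .appName("csv-read-sketch")
                .getOrCreate();

        // Built-in Spark 2.x csv source (the approach CountTen.java takes)
        Dataset<Row> builtIn = spark.read().format("csv")
                .option("header", "true")
                .load("./data/train_.csv");

        // The same read addressed through the spark-csv format name (Spark 1.x style)
        Dataset<Row> legacy = spark.read().format("com.databricks.spark.csv")
                .option("header", "true")
                .load("./data/train_.csv");

        builtIn.printSchema();
        legacy.printSchema();
    }
}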
src/main/java/CountTen.java
0 → 100644
+import org.apache.spark.sql.Column;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.expressions.Window;
+import org.apache.spark.sql.expressions.WindowSpec;
+
+import static org.apache.spark.sql.functions.*;
+
+
+public class CountTen {
+
+    public static void main(String[] args) throws Exception {
+        SparkSession spark = SparkSession
+                .builder()
+                .master("local")
+                .appName("Java Spark SQL basic example")
+                .getOrCreate();
+
+        // load the sample click log with header and inferred column types
+        Dataset<Row> df = spark.read().format("csv")
+                .option("inferSchema", "true")
+                .option("header", "true")
+                .load("./data/train_.csv");
+
+        // cast the timestamp columns to long (epoch seconds)
+        Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long"));
+        newdf = newdf.withColumn("utc_attributed_time", df.col("attributed_time").cast("long"));
+        newdf = newdf.drop("click_time").drop("attributed_time");
+
+        // range window per ip: from the current click to 600 seconds (10 minutes) after it
+        WindowSpec w = Window.partitionBy("ip")
+                .orderBy("utc_click_time")
+                .rangeBetween(Window.currentRow(), Window.currentRow() + 600);
+//                .rowsBetween(Window.currentRow(), Window.unboundedPreceding()); // Boundary end is not a valid integer: -9223372036854775808
+
+        // count of further clicks from the same ip within the next 10 minutes
+        newdf = newdf.withColumn("is_clicked_in_ten_mins",
+                (count("utc_click_time").over(w)).minus(1)); // TODO: decide whether the current click itself should be counted
+//        newdf = newdf.withColumn("is_clicked_in_ten_mins",
+//                (lead(col("utc_click_time"), 1).over(w).minus(col("utc_click_time")).lt((long) 600)).cast("long"));
+
+        newdf.where("ip == '117898'").show(false);
+    }
+}
\ No newline at end of file
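The commented-out lines in CountTen.java hint at a lead-based variant that flags, per click, whether the same ip clicks again within 600 seconds, instead of counting all later clicks in the window. A rough sketch of that variant follows, assuming the same input file and column names; the output column name and the strict less-than boundary are choices made here, not fixed by the PR. Note that lead defines its own single-row offset frame, so the window below carries no rangeBetween.

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;

import static org.apache.spark.sql.functions.*;

public class LeadFlagSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local")
                .appName("lead-flag-sketch")
                .getOrCreate();

        Dataset<Row> df = spark.read().format("csv")
                .option("inferSchema", "true")
                .option("header", "true")
                .load("./data/train_.csv");

        Dataset<Row> newdf = df
                .withColumn("utc_click_time", df.col("click_time").cast("long"))
                .drop("click_time", "attributed_time");

        // Plain ordered window per ip; lead brings its own offset frame, so no range is set.
        WindowSpec w = Window.partitionBy("ip").orderBy("utc_click_time");

        // 1 if the next click from the same ip arrives in under 600 seconds, 0 otherwise;
        // the last click of each ip has a null lead and therefore a null flag.
        Column nextClick = lead(col("utc_click_time"), 1).over(w);
        newdf = newdf.withColumn("clicked_again_within_ten_mins",
                nextClick.minus(col("utc_click_time")).lt(600L).cast("long"));

        newdf.where("ip == '117898'").show(false);
    }
}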