Change lead() to a windowed count() for the ten-minute click feature

hyungyun.Moon
Commit dcc8c3717051f78c20ad0f96d876f801be08714d dcc8c371 1 parent efbc91aa
Showing 2 changed files with 14 additions and 3 deletions
data/train_.csv
src/main/java/CountTen.java
--- a/data/train_.csv 0 → 100644
View file @dcc8c37
+++ b/data/train_.csv 0 → 100644
View file @dcc8c37
+ip,app,device,os,channel,click_time,attributed_time,is_attributed
+117898,12,1,13,497,2017-11-07 09:30:38,,0
+117898,12,1,13,497,2017-11-07 09:30:38,,0
+117898,12,1,13,497,2017-11-07 09:31:38,,0
+117898,12,1,13,497,2017-11-07 09:31:38,,0
+117898,12,1,13,497,2017-11-07 09:31:38,,0
+117898,12,1,13,497,2017-11-07 09:39:38,,0
+117898,12,1,13,497,2017-11-07 09:40:38,,0
\ No newline at end of file
--- a/src/main/java/CountTen.java
View file @dcc8c37
+++ b/src/main/java/CountTen.java
View file @dcc8c37
@@ -20,7 +20,7 @@ public class CountTen {
         Dataset<Row> df = spark.read().format("csv")
                 .option("inferSchema", "true")
                 .option("header", "true")
-                .load("./data/train.csv");
+                .load("./data/train_.csv");
         // cast timestamp to long
         Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long"));
@@ -28,11 +28,14 @@ public class CountTen {
         newdf = newdf.drop("click_time").drop("attributed_time");
         WindowSpec w = Window.partitionBy("ip")
-                .orderBy("utc_click_time");
+                .orderBy("utc_click_time")
+                .rangeBetween(Window.currentRow(),Window.currentRow()+600);
 //                .rowsBetween(Window.currentRow(), Window.unboundedPreceding());   //Boundary end is not a valid integer: -9223372036854775808
         newdf = newdf.withColumn("is_clicked_in_ten_mins",
-                (lead(col("utc_click_time"),1).over(w).minus(col("utc_click_time")).lt((long)600)).cast("long"));
+                (count("utc_click_time").over(w)).minus(1));    // TODO: decide whether the current row itself should be included in the count.
+//        newdf = newdf.withColumn("is_clicked_in_ten_mins",
+//                (lead(col("utc_click_time"),1).over(w).minus(col("utc_click_time")).lt((long)600)).cast("long"));
         newdf.where("ip == '117898'").show(false);
     }