hyungyun.Moon

change lead to count

1 +ip,app,device,os,channel,click_time,attributed_time,is_attributed
2 +117898,12,1,13,497,2017-11-07 09:30:38,,0
3 +117898,12,1,13,497,2017-11-07 09:30:38,,0
4 +117898,12,1,13,497,2017-11-07 09:31:38,,0
5 +117898,12,1,13,497,2017-11-07 09:31:38,,0
6 +117898,12,1,13,497,2017-11-07 09:31:38,,0
7 +117898,12,1,13,497,2017-11-07 09:39:38,,0
8 +117898,12,1,13,497,2017-11-07 09:40:38,,0
...\ No newline at end of file ...\ No newline at end of file
...@@ -20,7 +20,7 @@ public class CountTen { ...@@ -20,7 +20,7 @@ public class CountTen {
20 Dataset<Row> df = spark.read().format("csv") 20 Dataset<Row> df = spark.read().format("csv")
21 .option("inferSchema", "true") 21 .option("inferSchema", "true")
22 .option("header", "true") 22 .option("header", "true")
23 - .load("./data/train.csv"); 23 + .load("./data/train_.csv");
24 24
25 // cast timestamp to long 25 // cast timestamp to long
26 Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long")); 26 Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long"));
...@@ -28,11 +28,14 @@ public class CountTen { ...@@ -28,11 +28,14 @@ public class CountTen {
28 newdf = newdf.drop("click_time").drop("attributed_time"); 28 newdf = newdf.drop("click_time").drop("attributed_time");
29 29
30 WindowSpec w = Window.partitionBy("ip") 30 WindowSpec w = Window.partitionBy("ip")
31 - .orderBy("utc_click_time"); 31 + .orderBy("utc_click_time")
32 + .rangeBetween(Window.currentRow(),Window.currentRow()+600);
32 // .rowsBetween(Window.currentRow(), Window.unboundedPreceding()); //Boundary end is not a valid integer: -9223372036854775808 33 // .rowsBetween(Window.currentRow(), Window.unboundedPreceding()); //Boundary end is not a valid integer: -9223372036854775808
33 34
34 newdf = newdf.withColumn("is_clicked_in_ten_mins", 35 newdf = newdf.withColumn("is_clicked_in_ten_mins",
35 - (lead(col("utc_click_time"),1).over(w).minus(col("utc_click_time")).lt((long)600)).cast("long")); 36 + (count("utc_click_time").over(w)).minus(1)); //본인것 포함할 것인지 정해야함.
37 +// newdf = newdf.withColumn("is_clicked_in_ten_mins",
38 +// (lead(col("utc_click_time"),1).over(w).minus(col("utc_click_time")).lt((long)600)).cast("long"));
36 39
37 newdf.where("ip == '117898'").show(false); 40 newdf.where("ip == '117898'").show(false);
38 } 41 }
......