Showing
2 changed files
with
14 additions
and
3 deletions
data/train_.csv
0 → 100644
1 | +ip,app,device,os,channel,click_time,attributed_time,is_attributed | ||
2 | +117898,12,1,13,497,2017-11-07 09:30:38,,0 | ||
3 | +117898,12,1,13,497,2017-11-07 09:30:38,,0 | ||
4 | +117898,12,1,13,497,2017-11-07 09:31:38,,0 | ||
5 | +117898,12,1,13,497,2017-11-07 09:31:38,,0 | ||
6 | +117898,12,1,13,497,2017-11-07 09:31:38,,0 | ||
7 | +117898,12,1,13,497,2017-11-07 09:39:38,,0 | ||
8 | +117898,12,1,13,497,2017-11-07 09:40:38,,0 | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
... | @@ -20,7 +20,7 @@ public class CountTen { | ... | @@ -20,7 +20,7 @@ public class CountTen { |
20 | Dataset<Row> df = spark.read().format("csv") | 20 | Dataset<Row> df = spark.read().format("csv") |
21 | .option("inferSchema", "true") | 21 | .option("inferSchema", "true") |
22 | .option("header", "true") | 22 | .option("header", "true") |
23 | - .load("./data/train.csv"); | 23 | + .load("./data/train_.csv"); |
24 | 24 | ||
25 | // cast timestamp to long | 25 | // cast timestamp to long |
26 | Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long")); | 26 | Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long")); |
... | @@ -28,11 +28,14 @@ public class CountTen { | ... | @@ -28,11 +28,14 @@ public class CountTen { |
28 | newdf = newdf.drop("click_time").drop("attributed_time"); | 28 | newdf = newdf.drop("click_time").drop("attributed_time"); |
29 | 29 | ||
30 | WindowSpec w = Window.partitionBy("ip") | 30 | WindowSpec w = Window.partitionBy("ip") |
31 | - .orderBy("utc_click_time"); | 31 | + .orderBy("utc_click_time") |
32 | + .rangeBetween(Window.currentRow(),Window.currentRow()+600); | ||
32 | // .rowsBetween(Window.currentRow(), Window.unboundedPreceding()); //Boundary end is not a valid integer: -9223372036854775808 | 33 | // .rowsBetween(Window.currentRow(), Window.unboundedPreceding()); //Boundary end is not a valid integer: -9223372036854775808 |
33 | 34 | ||
34 | newdf = newdf.withColumn("is_clicked_in_ten_mins", | 35 | newdf = newdf.withColumn("is_clicked_in_ten_mins", |
35 | - (lead(col("utc_click_time"),1).over(w).minus(col("utc_click_time")).lt((long)600)).cast("long")); | 36 | + (count("utc_click_time").over(w)).minus(1)); //본인것 포함할 것인지 정해야함. |
37 | +// newdf = newdf.withColumn("is_clicked_in_ten_mins", | ||
38 | +// (lead(col("utc_click_time"),1).over(w).minus(col("utc_click_time")).lt((long)600)).cast("long")); | ||
36 | 39 | ||
37 | newdf.where("ip == '117898'").show(false); | 40 | newdf.where("ip == '117898'").show(false); |
38 | } | 41 | } | ... | ... |
-
Please register or login to post a comment