신은섭(Shin Eun Seop)
Committed by GitHub

Merge pull request #17 from Java-Cesco/feature/tenMinsHG

Feature/ten mins hg
Detecting_fraud_clicks
\ No newline at end of file
This diff could not be displayed because it is too large.
ip,app,device,os,channel,click_time,attributed_time,is_attributed
117898,12,1,13,497,2017-11-07 09:30:38,,0
117898,12,1,13,497,2017-11-07 09:30:38,,0
117898,12,1,13,497,2017-11-07 09:31:38,,0
117898,12,1,13,497,2017-11-07 09:31:38,,0
117898,12,1,13,497,2017-11-07 09:31:38,,0
117898,12,1,13,497,2017-11-07 09:39:38,,0
117898,12,1,13,497,2017-11-07 09:40:38,,0
\ No newline at end of file
......@@ -16,19 +16,21 @@
<artifactId>spark-core_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-csv_2.11</artifactId>
<version>1.5.0</version>
</dependency>
</dependencies>
<build>
......@@ -45,5 +47,4 @@
</plugins>
</build>
</project>
\ No newline at end of file
......
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import static org.apache.spark.sql.functions.*;
/**
 * Small Spark job that, for every click in the training CSV, counts how many
 * other clicks from the same {@code ip} occur within the following ten minutes.
 *
 * <p>The result is written to a new column {@code is_clicked_in_ten_mins} and the
 * rows for one sample ip are printed to stdout.
 */
public class CountTen {

    /** Width of the look-ahead window in seconds (ten minutes). */
    private static final long TEN_MINUTES_IN_SECONDS = 600L;

    /**
     * Runs the job against {@code ./data/train_.csv} on a local Spark master.
     *
     * @param args unused
     * @throws Exception if Spark fails to read or process the input
     */
    public static void main(String[] args) throws Exception {
        SparkSession spark = SparkSession
                .builder()
                .master("local")
                .appName("Java Spark SQL basic example")
                .getOrCreate();

        try {
            Dataset<Row> clicks = spark.read().format("csv")
                    .option("inferSchema", "true")
                    .option("header", "true")
                    .load("./data/train_.csv");

            // Cast the timestamp columns to long so that the range frame below can be
            // expressed as a plain numeric offset (Spark's timestamp-to-long cast
            // yields seconds since the epoch), then drop the original columns.
            Dataset<Row> withEpoch = clicks
                    .withColumn("utc_click_time", col("click_time").cast("long"))
                    .withColumn("utc_attributed_time", col("attributed_time").cast("long"))
                    .drop("click_time")
                    .drop("attributed_time");

            // Per-ip frame covering [current click time, current click time + 10 min].
            // NOTE: rowsBetween(currentRow, unboundedPreceding) was tried earlier and
            // rejected by Spark with "Boundary end is not a valid integer".
            WindowSpec tenMinutesAhead = Window.partitionBy("ip")
                    .orderBy("utc_click_time")
                    .rangeBetween(Window.currentRow(), Window.currentRow() + TEN_MINUTES_IN_SECONDS);

            // The frame always contains the current row, so subtract 1 to count only
            // the *other* clicks. Clicks sharing the exact same timestamp are still
            // counted. TODO: decide whether the current click itself should be included.
            // (An alternative based on lead(utc_click_time, 1) - utc_click_time < 600
            // was considered; it would only flag the immediately following click.)
            Dataset<Row> result = withEpoch.withColumn("is_clicked_in_ten_mins",
                    count("utc_click_time").over(tenMinutesAhead).minus(1));

            result.where("ip == '117898'").show(false);
        } finally {
            // Always release the local SparkContext, even when the job fails.
            spark.stop();
        }
    }
}
\ No newline at end of file