신은섭(Shin Eun Seop)
Committed by GitHub

Merge pull request #17 from Java-Cesco/feature/tenMinsHG

Feature/ten mins hg
1 +Detecting_fraud_clicks
...\ No newline at end of file ...\ No newline at end of file
...@@ -172,4 +172,4 @@ ...@@ -172,4 +172,4 @@
172 <orderEntry type="library" name="Maven: com.databricks:spark-csv_2.11:1.5.0" level="project" /> 172 <orderEntry type="library" name="Maven: com.databricks:spark-csv_2.11:1.5.0" level="project" />
173 <orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.1" level="project" /> 173 <orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.1" level="project" />
174 </component> 174 </component>
175 -</module>
...\ No newline at end of file ...\ No newline at end of file
175 +</module>
......
This diff could not be displayed because it is too large.
1 +ip,app,device,os,channel,click_time,attributed_time,is_attributed
2 +117898,12,1,13,497,2017-11-07 09:30:38,,0
3 +117898,12,1,13,497,2017-11-07 09:30:38,,0
4 +117898,12,1,13,497,2017-11-07 09:31:38,,0
5 +117898,12,1,13,497,2017-11-07 09:31:38,,0
6 +117898,12,1,13,497,2017-11-07 09:31:38,,0
7 +117898,12,1,13,497,2017-11-07 09:39:38,,0
8 +117898,12,1,13,497,2017-11-07 09:40:38,,0
...\ No newline at end of file ...\ No newline at end of file
...@@ -16,19 +16,21 @@ ...@@ -16,19 +16,21 @@
16 <artifactId>spark-core_2.11</artifactId> 16 <artifactId>spark-core_2.11</artifactId>
17 <version>2.3.0</version> 17 <version>2.3.0</version>
18 </dependency> 18 </dependency>
19 - 19 + <dependency>
20 + <groupId>org.apache.spark</groupId>
21 + <artifactId>spark-sql_2.11</artifactId>
22 + <version>2.2.0</version>
23 + </dependency>
20 <dependency> 24 <dependency>
21 <groupId>org.apache.spark</groupId> 25 <groupId>org.apache.spark</groupId>
22 <artifactId>spark-sql_2.11</artifactId> 26 <artifactId>spark-sql_2.11</artifactId>
23 <version>2.3.0</version> 27 <version>2.3.0</version>
24 </dependency> 28 </dependency>
25 -
26 <dependency> 29 <dependency>
27 <groupId>com.databricks</groupId> 30 <groupId>com.databricks</groupId>
28 <artifactId>spark-csv_2.11</artifactId> 31 <artifactId>spark-csv_2.11</artifactId>
29 <version>1.5.0</version> 32 <version>1.5.0</version>
30 </dependency> 33 </dependency>
31 -
32 </dependencies> 34 </dependencies>
33 35
34 <build> 36 <build>
...@@ -44,6 +46,5 @@ ...@@ -44,6 +46,5 @@
44 </plugin> 46 </plugin>
45 </plugins> 47 </plugins>
46 </build> 48 </build>
47 - 49 +
48 -
49 </project> 50 </project>
...\ No newline at end of file ...\ No newline at end of file
......
1 +import org.apache.spark.sql.Column;
2 +import org.apache.spark.sql.Dataset;
3 +import org.apache.spark.sql.Row;
4 +import org.apache.spark.sql.SparkSession;
5 +import org.apache.spark.sql.expressions.Window;
6 +import org.apache.spark.sql.expressions.WindowSpec;
7 +
8 +import static org.apache.spark.sql.functions.*;
9 +
10 +
11 +public class CountTen {
12 +
13 + public static void main(String[] args) throws Exception {
14 + SparkSession spark = SparkSession
15 + .builder()
16 + .master("local")
17 + .appName("Java Spark SQL basic example")
18 + .getOrCreate();
19 +
20 + Dataset<Row> df = spark.read().format("csv")
21 + .option("inferSchema", "true")
22 + .option("header", "true")
23 + .load("./data/train_.csv");
24 +
25 + // cast timestamp to long
26 + Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long"));
27 + newdf = newdf.withColumn("utc_attributed_time", df.col("attributed_time").cast("long"));
28 + newdf = newdf.drop("click_time").drop("attributed_time");
29 +
30 + WindowSpec w = Window.partitionBy("ip")
31 + .orderBy("utc_click_time")
32 + .rangeBetween(Window.currentRow(),Window.currentRow()+600);
33 +// .rowsBetween(Window.currentRow(), Window.unboundedPreceding()); //Boundary end is not a valid integer: -9223372036854775808
34 +
35 + newdf = newdf.withColumn("is_clicked_in_ten_mins",
36 + (count("utc_click_time").over(w)).minus(1)); //본인것 포함할 것인지 정해야함.
37 +// newdf = newdf.withColumn("is_clicked_in_ten_mins",
38 +// (lead(col("utc_click_time"),1).over(w).minus(col("utc_click_time")).lt((long)600)).cast("long"));
39 +
40 + newdf.where("ip == '117898'").show(false);
41 + }
42 +}
...\ No newline at end of file ...\ No newline at end of file