Merge pull request #17 from Java-Cesco/feature/tenMinsHG

Feature/ten mins hg

Merge pull request #17 from Java-Cesco/feature/tenMinsHG
Feature/ten mins hg
신은섭(Shin Eun Seop) · GitHub
Commit a0aa94695f74ac72c776bd79cb5346dcc8a870f3 a0aa9469 2 parents 41a22842 27b69035
Showing 6 changed files with 58 additions and 6 deletions
.idea/.name
.idea/Detecting_fraud_clicks.iml
data/train.csv
data/train_.csv
pom.xml
src/main/java/CountTen.java
--- a/.idea/.name 0 → 100644
View file @a0aa946
+++ b/.idea/.name 0 → 100644
View file @a0aa946
+Detecting_fraud_clicks
\ No newline at end of file
--- a/.idea/Detecting_fraud_clicks.iml
View file @a0aa946
+++ b/.idea/Detecting_fraud_clicks.iml
View file @a0aa946
@@ -172,4 +172,4 @@
     <orderEntry type="library" name="Maven: com.databricks:spark-csv_2.11:1.5.0" level="project" />
     <orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.1" level="project" />
   </component>
-</module>
\ No newline at end of file
+</module>
--- a/data/train.csv 0 → 100644
View file @a0aa946
+++ b/data/train.csv 0 → 100644
View file @a0aa946
--- a/data/train_.csv 0 → 100644
View file @a0aa946
+++ b/data/train_.csv 0 → 100644
View file @a0aa946
+ip,app,device,os,channel,click_time,attributed_time,is_attributed
+117898,12,1,13,497,2017-11-07 09:30:38,,0
+117898,12,1,13,497,2017-11-07 09:30:38,,0
+117898,12,1,13,497,2017-11-07 09:31:38,,0
+117898,12,1,13,497,2017-11-07 09:31:38,,0
+117898,12,1,13,497,2017-11-07 09:31:38,,0
+117898,12,1,13,497,2017-11-07 09:39:38,,0
+117898,12,1,13,497,2017-11-07 09:40:38,,0
\ No newline at end of file
--- a/pom.xml
View file @a0aa946
+++ b/pom.xml
View file @a0aa946
@@ -16,19 +16,21 @@
             <artifactId>spark-core_2.11</artifactId>
             <version>2.3.0</version>
         </dependency>
-
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-sql_2.11</artifactId>
+            <version>2.2.0</version>
+        </dependency>
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-sql_2.11</artifactId>
             <version>2.3.0</version>
         </dependency>
-
         <dependency>
             <groupId>com.databricks</groupId>
             <artifactId>spark-csv_2.11</artifactId>
             <version>1.5.0</version>
         </dependency>
-
     </dependencies>
     <build>
@@ -44,6 +46,5 @@
             </plugin>
         </plugins>
     </build>
-
+    
-
 </project>
\ No newline at end of file
--- a/src/main/java/CountTen.java 0 → 100644
View file @a0aa946
+++ b/src/main/java/CountTen.java 0 → 100644
View file @a0aa946
+import org.apache.spark.sql.Column;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.expressions.Window;
+import org.apache.spark.sql.expressions.WindowSpec;
+
+import static org.apache.spark.sql.functions.*;
+
+
+/**
+ * Small Spark SQL job that, for every click in {@code ./data/train_.csv}, counts how many
+ * other clicks from the same {@code ip} fall inside the ten minutes that follow it.
+ *
+ * <p>The result is written to a new column named {@code is_clicked_in_ten_mins} and the rows
+ * for one hard-coded sample ip are printed to standard output.
+ */
+public class CountTen {
+
+    /** Width of the look-ahead frame, in seconds (ten minutes). */
+    private static final long WINDOW_SECONDS = 600L;
+
+    /**
+     * Runs the job against the sample CSV and prints the rows of ip {@code 117898}.
+     *
+     * @param args command-line arguments; not used
+     * @throws Exception kept for interface compatibility with the original entry point
+     */
+    public static void main(String[] args) throws Exception {
+        SparkSession spark = SparkSession
+                .builder()
+                .master("local")
+                .appName("Java Spark SQL basic example")
+                .getOrCreate();
+
+        // Always release the local Spark context, even when reading or evaluating fails.
+        try {
+            Dataset<Row> clicks = spark.read().format("csv")
+                    .option("inferSchema", "true")
+                    .option("header", "true")
+                    .load("./data/train_.csv");
+
+            // Replace the two time columns by their long casts so that the window frame
+            // below can be expressed as a plain numeric range.
+            Dataset<Row> withEpoch = clicks
+                    .withColumn("utc_click_time", col("click_time").cast("long"))
+                    .withColumn("utc_attributed_time", col("attributed_time").cast("long"))
+                    .drop("click_time")
+                    .drop("attributed_time");
+
+            // Range frame per ip: every row whose utc_click_time lies in
+            // [current, current + WINDOW_SECONDS], both ends inclusive.
+            // Note: a rowsBetween(currentRow, unboundedPreceding) frame was tried earlier and
+            // Spark rejected it with
+            // "Boundary end is not a valid integer: -9223372036854775808".
+            WindowSpec nextTenMinutes = Window.partitionBy("ip")
+                    .orderBy("utc_click_time")
+                    .rangeBetween(Window.currentRow(), Window.currentRow() + WINDOW_SECONDS);
+
+            // The frame contains the current row itself, hence the "minus 1".
+            // Rows sharing the exact same timestamp are still counted.
+            // TODO: decide whether the current click should be included in the count.
+            // (An alternative based on lead(utc_click_time, 1) compared against 600 was
+            // also considered and is not used.)
+            Dataset<Row> counted = withEpoch.withColumn("is_clicked_in_ten_mins",
+                    count("utc_click_time").over(nextTenMinutes).minus(1));
+
+            counted.where("ip == '117898'").show(false);
+        } finally {
+            spark.stop();
+        }
+    }
+}
\ No newline at end of file