Showing
3 changed files
with
49 additions
and
0 deletions
... | @@ -21,6 +21,16 @@ | ... | @@ -21,6 +21,16 @@ |
21 | <artifactId>spark-sql_2.11</artifactId> | 21 | <artifactId>spark-sql_2.11</artifactId> |
22 | <version>2.2.0</version> | 22 | <version>2.2.0</version> |
23 | </dependency> | 23 | </dependency> |
24 | + <dependency> | ||
25 | + <groupId>org.apache.spark</groupId> | ||
26 | + <artifactId>spark-sql_2.11</artifactId> | ||
27 | + <version>2.3.0</version> | ||
28 | + </dependency> | ||
29 | + <dependency> | ||
30 | + <groupId>com.databricks</groupId> | ||
31 | + <artifactId>spark-csv_2.11</artifactId> | ||
32 | + <version>1.5.0</version> | ||
33 | + </dependency> | ||
24 | 34 | ||
25 | </dependencies> | 35 | </dependencies> |
26 | 36 | ... | ... |
src/main/java/CountTen.java
0 → 100644
1 | +import org.apache.spark.sql.Column; | ||
2 | +import org.apache.spark.sql.Dataset; | ||
3 | +import org.apache.spark.sql.Row; | ||
4 | +import org.apache.spark.sql.SparkSession; | ||
5 | +import org.apache.spark.sql.expressions.Window; | ||
6 | +import org.apache.spark.sql.expressions.WindowSpec; | ||
7 | + | ||
8 | +import static org.apache.spark.sql.functions.*; | ||
9 | + | ||
10 | + | ||
11 | +public class CountTen { | ||
12 | + | ||
13 | + public static void main(String[] args) throws Exception { | ||
14 | + SparkSession spark = SparkSession | ||
15 | + .builder() | ||
16 | + .master("local") | ||
17 | + .appName("Java Spark SQL basic example") | ||
18 | + .getOrCreate(); | ||
19 | + | ||
20 | + Dataset<Row> df = spark.read().format("csv") | ||
21 | + .option("inferSchema", "true") | ||
22 | + .option("header", "true") | ||
23 | + .load("./data/train.csv"); | ||
24 | + | ||
25 | + // cast timestamp to long | ||
26 | + Dataset<Row> newdf = df.withColumn("utc_click_time", df.col("click_time").cast("long")); | ||
27 | + newdf = newdf.withColumn("utc_attributed_time", df.col("attributed_time").cast("long")); | ||
28 | + newdf = newdf.drop("click_time").drop("attributed_time"); | ||
29 | + | ||
30 | + WindowSpec w = Window.partitionBy("ip") | ||
31 | + .orderBy("utc_click_time"); | ||
32 | +// .rowsBetween(Window.currentRow(), Window.unboundedPreceding()); //Boundary end is not a valid integer: -9223372036854775808 | ||
33 | + | ||
34 | + newdf = newdf.withColumn("is_clicked_in_ten_mins", | ||
35 | + (lead(col("utc_click_time"),1).over(w).minus(col("utc_click_time")).lt((long)600)).cast("long")); | ||
36 | + | ||
37 | + newdf.where("ip == '117898'").show(false); | ||
38 | + } | ||
39 | +} | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or login to post a comment