신은섭(Shin Eun Seop)
Committed by GitHub

Merge pull request #18 from Java-Cesco/feature/#3

Aggregation
File mode changed
Detecting_fraud_clicks
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
<output url="file://$MODULE_DIR$/target/classes" />
<output-test url="file://$MODULE_DIR$/target/test-classes" />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
<sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" />
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Maven: com.databricks:spark-csv_2.11:1.5.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.spark:spark-core_2.11:2.3.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.avro:avro:1.7.7" level="project" />
<orderEntry type="library" name="Maven: org.codehaus.jackson:jackson-core-asl:1.9.13" level="project" />
<orderEntry type="library" name="Maven: org.codehaus.jackson:jackson-mapper-asl:1.9.13" level="project" />
<orderEntry type="library" name="Maven: com.thoughtworks.paranamer:paranamer:2.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-compress:1.4.1" level="project" />
<orderEntry type="library" name="Maven: org.tukaani:xz:1.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.avro:avro-mapred:hadoop2:1.7.7" level="project" />
<orderEntry type="library" name="Maven: org.apache.avro:avro-ipc:1.7.7" level="project" />
<orderEntry type="library" name="Maven: org.apache.avro:avro-ipc:tests:1.7.7" level="project" />
<orderEntry type="library" name="Maven: com.twitter:chill_2.11:0.8.4" level="project" />
<orderEntry type="library" name="Maven: com.esotericsoftware:kryo-shaded:3.0.3" level="project" />
<orderEntry type="library" name="Maven: com.esotericsoftware:minlog:1.3.0" level="project" />
<orderEntry type="library" name="Maven: org.objenesis:objenesis:2.1" level="project" />
<orderEntry type="library" name="Maven: com.twitter:chill-java:0.8.4" level="project" />
<orderEntry type="library" name="Maven: org.apache.xbean:xbean-asm5-shaded:4.4" level="project" />
<orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-client:2.6.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-common:2.6.5" level="project" />
<orderEntry type="library" name="Maven: commons-cli:commons-cli:1.2" level="project" />
<orderEntry type="library" name="Maven: xmlenc:xmlenc:0.52" level="project" />
<orderEntry type="library" name="Maven: commons-httpclient:commons-httpclient:3.1" level="project" />
<orderEntry type="library" name="Maven: commons-io:commons-io:2.4" level="project" />
<orderEntry type="library" name="Maven: commons-collections:commons-collections:3.2.2" level="project" />
<orderEntry type="library" name="Maven: commons-configuration:commons-configuration:1.6" level="project" />
<orderEntry type="library" name="Maven: commons-digester:commons-digester:1.8" level="project" />
<orderEntry type="library" name="Maven: commons-beanutils:commons-beanutils:1.7.0" level="project" />
<orderEntry type="library" name="Maven: commons-beanutils:commons-beanutils-core:1.8.0" level="project" />
<orderEntry type="library" name="Maven: com.google.code.gson:gson:2.2.4" level="project" />
<orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-auth:2.6.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.directory.server:apacheds-kerberos-codec:2.0.0-M15" level="project" />
<orderEntry type="library" name="Maven: org.apache.directory.server:apacheds-i18n:2.0.0-M15" level="project" />
<orderEntry type="library" name="Maven: org.apache.directory.api:api-asn1-api:1.0.0-M20" level="project" />
<orderEntry type="library" name="Maven: org.apache.directory.api:api-util:1.0.0-M20" level="project" />
<orderEntry type="library" name="Maven: org.apache.curator:curator-client:2.6.0" level="project" />
<orderEntry type="library" name="Maven: org.htrace:htrace-core:3.0.4" level="project" />
<orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-hdfs:2.6.5" level="project" />
<orderEntry type="library" name="Maven: org.mortbay.jetty:jetty-util:6.1.26" level="project" />
<orderEntry type="library" name="Maven: xerces:xercesImpl:2.9.1" level="project" />
<orderEntry type="library" name="Maven: xml-apis:xml-apis:1.3.04" level="project" />
<orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-mapreduce-client-app:2.6.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-mapreduce-client-common:2.6.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-yarn-client:2.6.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-yarn-server-common:2.6.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-mapreduce-client-shuffle:2.6.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-yarn-api:2.6.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-mapreduce-client-core:2.6.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-yarn-common:2.6.5" level="project" />
<orderEntry type="library" name="Maven: javax.xml.bind:jaxb-api:2.2.2" level="project" />
<orderEntry type="library" name="Maven: javax.xml.stream:stax-api:1.0-2" level="project" />
<orderEntry type="library" name="Maven: org.codehaus.jackson:jackson-jaxrs:1.9.13" level="project" />
<orderEntry type="library" name="Maven: org.codehaus.jackson:jackson-xc:1.9.13" level="project" />
<orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-mapreduce-client-jobclient:2.6.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-annotations:2.6.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.spark:spark-launcher_2.11:2.3.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.spark:spark-kvstore_2.11:2.3.0" level="project" />
<orderEntry type="library" name="Maven: org.fusesource.leveldbjni:leveldbjni-all:1.8" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-core:2.6.7" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-annotations:2.6.7" level="project" />
<orderEntry type="library" name="Maven: org.apache.spark:spark-network-common_2.11:2.3.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.spark:spark-network-shuffle_2.11:2.3.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.spark:spark-unsafe_2.11:2.3.0" level="project" />
<orderEntry type="library" name="Maven: net.java.dev.jets3t:jets3t:0.9.4" level="project" />
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpcore:4.4.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.httpcomponents:httpclient:4.5" level="project" />
<orderEntry type="library" name="Maven: commons-codec:commons-codec:2.0-SNAPSHOT" level="project" />
<orderEntry type="library" name="Maven: javax.activation:activation:1.1.1" level="project" />
<orderEntry type="library" name="Maven: org.bouncycastle:bcprov-jdk15on:1.52" level="project" />
<orderEntry type="library" name="Maven: com.jamesmurty.utils:java-xmlbuilder:1.1" level="project" />
<orderEntry type="library" name="Maven: net.iharder:base64:2.3.8" level="project" />
<orderEntry type="library" name="Maven: org.apache.curator:curator-recipes:2.6.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.curator:curator-framework:2.6.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.zookeeper:zookeeper:3.4.6" level="project" />
<orderEntry type="library" name="Maven: com.google.guava:guava:16.0.1" level="project" />
<orderEntry type="library" name="Maven: javax.servlet:javax.servlet-api:3.1.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-math3:3.4.1" level="project" />
<orderEntry type="library" name="Maven: com.google.code.findbugs:jsr305:1.3.9" level="project" />
<orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.7.16" level="project" />
<orderEntry type="library" name="Maven: org.slf4j:jul-to-slf4j:1.7.16" level="project" />
<orderEntry type="library" name="Maven: org.slf4j:jcl-over-slf4j:1.7.16" level="project" />
<orderEntry type="library" name="Maven: log4j:log4j:1.2.17" level="project" />
<orderEntry type="library" name="Maven: org.slf4j:slf4j-log4j12:1.7.16" level="project" />
<orderEntry type="library" name="Maven: com.ning:compress-lzf:1.0.3" level="project" />
<orderEntry type="library" name="Maven: org.xerial.snappy:snappy-java:1.1.2.6" level="project" />
<orderEntry type="library" name="Maven: org.lz4:lz4-java:1.4.0" level="project" />
<orderEntry type="library" name="Maven: com.github.luben:zstd-jni:1.3.2-2" level="project" />
<orderEntry type="library" name="Maven: org.roaringbitmap:RoaringBitmap:0.5.11" level="project" />
<orderEntry type="library" name="Maven: commons-net:commons-net:2.2" level="project" />
<orderEntry type="library" name="Maven: org.scala-lang:scala-library:2.11.8" level="project" />
<orderEntry type="library" name="Maven: org.json4s:json4s-jackson_2.11:3.2.11" level="project" />
<orderEntry type="library" name="Maven: org.json4s:json4s-core_2.11:3.2.11" level="project" />
<orderEntry type="library" name="Maven: org.json4s:json4s-ast_2.11:3.2.11" level="project" />
<orderEntry type="library" name="Maven: org.scala-lang:scalap:2.11.0" level="project" />
<orderEntry type="library" name="Maven: org.scala-lang:scala-compiler:2.11.0" level="project" />
<orderEntry type="library" name="Maven: org.scala-lang.modules:scala-xml_2.11:1.0.1" level="project" />
<orderEntry type="library" name="Maven: org.glassfish.jersey.core:jersey-client:2.22.2" level="project" />
<orderEntry type="library" name="Maven: javax.ws.rs:javax.ws.rs-api:2.0.1" level="project" />
<orderEntry type="library" name="Maven: org.glassfish.hk2:hk2-api:2.4.0-b34" level="project" />
<orderEntry type="library" name="Maven: org.glassfish.hk2:hk2-utils:2.4.0-b34" level="project" />
<orderEntry type="library" name="Maven: org.glassfish.hk2.external:aopalliance-repackaged:2.4.0-b34" level="project" />
<orderEntry type="library" name="Maven: org.glassfish.hk2.external:javax.inject:2.4.0-b34" level="project" />
<orderEntry type="library" name="Maven: org.glassfish.hk2:hk2-locator:2.4.0-b34" level="project" />
<orderEntry type="library" name="Maven: org.javassist:javassist:3.18.1-GA" level="project" />
<orderEntry type="library" name="Maven: org.glassfish.jersey.core:jersey-common:2.22.2" level="project" />
<orderEntry type="library" name="Maven: javax.annotation:javax.annotation-api:1.2" level="project" />
<orderEntry type="library" name="Maven: org.glassfish.jersey.bundles.repackaged:jersey-guava:2.22.2" level="project" />
<orderEntry type="library" name="Maven: org.glassfish.hk2:osgi-resource-locator:1.0.1" level="project" />
<orderEntry type="library" name="Maven: org.glassfish.jersey.core:jersey-server:2.22.2" level="project" />
<orderEntry type="library" name="Maven: org.glassfish.jersey.media:jersey-media-jaxb:2.22.2" level="project" />
<orderEntry type="library" name="Maven: javax.validation:validation-api:1.1.0.Final" level="project" />
<orderEntry type="library" name="Maven: org.glassfish.jersey.containers:jersey-container-servlet:2.22.2" level="project" />
<orderEntry type="library" name="Maven: org.glassfish.jersey.containers:jersey-container-servlet-core:2.22.2" level="project" />
<orderEntry type="library" name="Maven: io.netty:netty-all:4.1.17.Final" level="project" />
<orderEntry type="library" name="Maven: io.netty:netty:3.9.9.Final" level="project" />
<orderEntry type="library" name="Maven: com.clearspring.analytics:stream:2.7.0" level="project" />
<orderEntry type="library" name="Maven: io.dropwizard.metrics:metrics-core:3.1.5" level="project" />
<orderEntry type="library" name="Maven: io.dropwizard.metrics:metrics-jvm:3.1.5" level="project" />
<orderEntry type="library" name="Maven: io.dropwizard.metrics:metrics-json:3.1.5" level="project" />
<orderEntry type="library" name="Maven: io.dropwizard.metrics:metrics-graphite:3.1.5" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-databind:2.6.7.1" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml.jackson.module:jackson-module-scala_2.11:2.6.7.1" level="project" />
<orderEntry type="library" name="Maven: org.scala-lang:scala-reflect:2.11.8" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml.jackson.module:jackson-module-paranamer:2.7.9" level="project" />
<orderEntry type="library" name="Maven: org.apache.ivy:ivy:2.4.0" level="project" />
<orderEntry type="library" name="Maven: oro:oro:2.0.8" level="project" />
<orderEntry type="library" name="Maven: net.razorvine:pyrolite:4.13" level="project" />
<orderEntry type="library" name="Maven: net.sf.py4j:py4j:0.10.6" level="project" />
<orderEntry type="library" name="Maven: org.apache.spark:spark-tags_2.11:2.3.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-crypto:1.0.0" level="project" />
<orderEntry type="library" name="Maven: org.spark-project.spark:unused:1.0.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.spark:spark-sql_2.11:2.3.0" level="project" />
<orderEntry type="library" name="Maven: com.univocity:univocity-parsers:2.5.9" level="project" />
<orderEntry type="library" name="Maven: org.apache.spark:spark-sketch_2.11:2.3.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.spark:spark-catalyst_2.11:2.3.0" level="project" />
<orderEntry type="library" name="Maven: org.scala-lang.modules:scala-parser-combinators_2.11:1.0.4" level="project" />
<orderEntry type="library" name="Maven: org.codehaus.janino:janino:3.0.8" level="project" />
<orderEntry type="library" name="Maven: org.codehaus.janino:commons-compiler:3.0.8" level="project" />
<orderEntry type="library" name="Maven: org.antlr:antlr4-runtime:4.7" level="project" />
<orderEntry type="library" name="Maven: org.apache.orc:orc-core:nohive:1.4.1" level="project" />
<orderEntry type="library" name="Maven: com.google.protobuf:protobuf-java:2.5.0" level="project" />
<orderEntry type="library" name="Maven: commons-lang:commons-lang:2.6" level="project" />
<orderEntry type="library" name="Maven: io.airlift:aircompressor:0.8" level="project" />
<orderEntry type="library" name="Maven: org.apache.orc:orc-mapreduce:nohive:1.4.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.parquet:parquet-column:1.8.2" level="project" />
<orderEntry type="library" name="Maven: org.apache.parquet:parquet-common:1.8.2" level="project" />
<orderEntry type="library" name="Maven: org.apache.parquet:parquet-encoding:1.8.2" level="project" />
<orderEntry type="library" name="Maven: org.apache.parquet:parquet-hadoop:1.8.2" level="project" />
<orderEntry type="library" name="Maven: org.apache.parquet:parquet-format:2.3.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.parquet:parquet-jackson:1.8.2" level="project" />
<orderEntry type="library" name="Maven: org.apache.arrow:arrow-vector:0.8.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.arrow:arrow-format:0.8.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.arrow:arrow-memory:0.8.0" level="project" />
<orderEntry type="library" name="Maven: joda-time:joda-time:2.9.9" level="project" />
<orderEntry type="library" name="Maven: com.carrotsearch:hppc:0.7.2" level="project" />
<orderEntry type="library" name="Maven: com.vlkan:flatbuffers:1.2.0-3f79e055" level="project" />
<orderEntry type="library" name="Maven: com.databricks:spark-csv_2.11:1.5.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.1" level="project" />
</component>
</module>
......
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CompilerConfiguration">
<annotationProcessing>
<profile name="Maven default annotation processors profile" enabled="true">
<sourceOutputDir name="target/generated-sources/annotations" />
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
<outputRelativeToContentRoot value="true" />
<module name="Detecting_fraud_clicks" />
</profile>
</annotationProcessing>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="MarkdownExportedFiles">
<htmlFiles />
<imageFiles />
<otherFiles />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
<component name="ExternalStorageConfigurationManager" enabled="true" />
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
</list>
</option>
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" project-jdk-name="1.8" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
</list>
</option>
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="false" project-jdk-name="1.8" project-jdk-type="JavaSDK">
<output url="file:///tmp" />
</component>
</project>
\ No newline at end of file
......
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Detecting_fraud_clicks.iml" filepath="$PROJECT_DIR$/.idea/Detecting_fraud_clicks.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
......
File mode changed
File mode changed
......@@ -16,7 +16,35 @@
<artifactId>spark-core_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-csv_2.11</artifactId>
<version>1.5.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
\ No newline at end of file
......
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import static org.apache.spark.sql.functions.*;
import static org.apache.spark.sql.functions.lit;
import static org.apache.spark.sql.functions.when;
/**
 * Derives per-click features from a click-log CSV using Spark SQL window functions.
 *
 * <p>The pipeline loads the CSV, replaces the {@code click_time} and
 * {@code attributed_time} columns with {@code long} versions, and then appends
 * three feature columns: {@code avg_valid_click_count}, {@code click_time_delta}
 * and {@code count_click_in_ten_mins}.
 */
public class Aggregation {

    /** Input file used when no path is given on the command line. */
    private static final String DEFAULT_INPUT_PATH = "./train_sample.csv";

    /**
     * Upper offset of the look-ahead range frame, in units of {@code utc_click_time}.
     * NOTE(review): treated as seconds (600 s = ten minutes) — this assumes the
     * long cast of {@code click_time} yields epoch seconds; confirm against the data.
     */
    private static final long TEN_MINUTES_IN_SECONDS = 600L;

    /** Name of the temporary column that holds the previous click time. */
    private static final String PREVIOUS_CLICK_TIME = "lag(utc_click_time)";

    /**
     * Runs the whole feature pipeline on a local Spark session and prints a sample.
     *
     * @param args optional; {@code args[0]} overrides the input CSV path
     *             (defaults to {@code ./train_sample.csv})
     */
    public static void main(String[] args) throws Exception {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

        //Create Session
        SparkSession spark = SparkSession
                .builder()
                .appName("Detecting Fraud Clicks")
                .master("local")
                .getOrCreate();

        try {
            // Aggregation
            Aggregation agg = new Aggregation();

            Dataset<Row> dataset = agg.loadCSVDataSet(inputPath, spark);
            dataset = agg.changeTimestampToLong(dataset);
            dataset = agg.averageValidClickCount(dataset);
            dataset = agg.clickTimeDelta(dataset);
            dataset = agg.countClickInTenMinutes(dataset);

            //test
            dataset.where("ip == '5348' and app == '19'").show(10);
        } finally {
            // Release the session even when a stage of the pipeline fails.
            spark.stop();
        }
    }

    /**
     * Reads a CSV file with a header row, letting Spark infer the column types.
     *
     * @param path  location of the CSV file
     * @param spark session used to read the file
     * @return the loaded dataset
     */
    private Dataset<Row> loadCSVDataSet(String path, SparkSession spark) {
        // Read CSV to DataSet
        return spark.read().format("csv")
                .option("inferSchema", "true")
                .option("header", "true")
                .load(path);
    }

    /**
     * Replaces {@code click_time} and {@code attributed_time} with
     * {@code utc_click_time} and {@code utc_attributed_time}, both cast to long.
     *
     * @param dataset dataset containing {@code click_time} and {@code attributed_time}
     * @return dataset with the two original columns dropped and the long columns added
     */
    private Dataset<Row> changeTimestampToLong(Dataset<Row> dataset) {
        // cast timestamp to long
        Dataset<Row> newDF = dataset.withColumn("utc_click_time", dataset.col("click_time").cast("long"));
        newDF = newDF.withColumn("utc_attributed_time", dataset.col("attributed_time").cast("long"));
        newDF = newDF.drop("click_time").drop("attributed_time");
        return newDF;
    }

    /**
     * Adds {@code avg_valid_click_count}: the running sum of {@code is_attributed}
     * divided by the running click count, per ({@code ip}, {@code app}) pair in
     * {@code utc_click_time} order.
     *
     * @param dataset dataset containing {@code ip}, {@code app},
     *                {@code utc_click_time} and {@code is_attributed}
     * @return dataset with the {@code avg_valid_click_count} column appended
     */
    private Dataset<Row> averageValidClickCount(Dataset<Row> dataset) {
        // set Window partition by 'ip' and 'app' order by 'utc_click_time' select rows between 1st row to current row
        WindowSpec w = Window.partitionBy("ip", "app")
                .orderBy("utc_click_time")
                .rowsBetween(Window.unboundedPreceding(), Window.currentRow());

        // aggregation
        Dataset<Row> newDF = dataset.withColumn("cum_count_click", count("utc_click_time").over(w));
        newDF = newDF.withColumn("cum_sum_attributed", sum("is_attributed").over(w));
        newDF = newDF.withColumn("avg_valid_click_count", col("cum_sum_attributed").divide(col("cum_count_click")));
        // The running totals are only intermediates of the ratio.
        newDF = newDF.drop("cum_count_click", "cum_sum_attributed");
        return newDF;
    }

    /**
     * Adds {@code click_time_delta}: the difference between a click's
     * {@code utc_click_time} and that of the previous click from the same
     * {@code ip}, or 0 when there is no previous click.
     *
     * @param dataset dataset containing {@code ip} and {@code utc_click_time}
     * @return dataset with the {@code click_time_delta} column appended
     */
    private Dataset<Row> clickTimeDelta(Dataset<Row> dataset) {
        WindowSpec w = Window.partitionBy("ip")
                .orderBy("utc_click_time");

        Dataset<Row> newDF = dataset.withColumn(PREVIOUS_CLICK_TIME, lag("utc_click_time", 1).over(w));
        // No previous click in the partition -> delta is 0; otherwise current minus previous.
        newDF = newDF.withColumn("click_time_delta",
                when(col(PREVIOUS_CLICK_TIME).isNull(), lit(0))
                        .otherwise(col("utc_click_time").minus(col(PREVIOUS_CLICK_TIME))));
        newDF = newDF.drop(PREVIOUS_CLICK_TIME);
        return newDF;
    }

    /**
     * Adds {@code count_click_in_ten_mins}: the number of other clicks from the
     * same {@code ip} whose {@code utc_click_time} lies in the range from the
     * current click's time up to {@link #TEN_MINUTES_IN_SECONDS} later.
     *
     * @param dataset dataset containing {@code ip} and {@code utc_click_time}
     * @return dataset with the {@code count_click_in_ten_mins} column appended
     */
    private Dataset<Row> countClickInTenMinutes(Dataset<Row> dataset) {
        WindowSpec w = Window.partitionBy("ip")
                .orderBy("utc_click_time")
                .rangeBetween(Window.currentRow(), TEN_MINUTES_IN_SECONDS);

        // TODO: decide whether the current click itself should be counted (it is excluded for now).
        Dataset<Row> newDF = dataset.withColumn("count_click_in_ten_mins",
                (count("utc_click_time").over(w)).minus(1));
        return newDF;
    }
}
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.Arrays;
import java.util.List;
/**
 * Minimal Spark RDD example showing two {@code map} transformations over a
 * small in-memory list of words.
 */
public class MapExample {

    /** Spark configuration: local master using all available cores, app name "Cesco". */
    static SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("Cesco");

    /** Context shared by the example; created when the class is loaded. */
    static JavaSparkContext sc = new JavaSparkContext(conf);

    /**
     * Maps each word to a {@code (word, 1)} pair and to a {@code (word, length)}
     * pair, collects both results and prints them.
     *
     * @param args unused
     */
    public static void main(String[] args) throws Exception {
        try {
            // Parallelized with 2 partitions
            JavaRDD<String> x = sc.parallelize(
                    Arrays.asList("spark", "rdd", "example", "sample", "example"),
                    2);

            // Word Count Map Example
            JavaRDD<Tuple2<String, Integer>> y1 = x.map(e -> new Tuple2<>(e, 1));
            List<Tuple2<String, Integer>> list1 = y1.collect();

            // Another example of making a tuple with a string and its length
            JavaRDD<Tuple2<String, Integer>> y2 = x.map(e -> new Tuple2<>(e, e.length()));
            List<Tuple2<String, Integer>> list2 = y2.collect();

            System.out.println(list1);
            // The second example's result was collected but never shown.
            System.out.println(list2);
        } finally {
            // Shut the context down so the local Spark runtime is released.
            sc.stop();
        }
    }
}
/**
 * Tiny smoke-test class: holds a single integer that starts at zero and can
 * print it to standard output.
 */
public class valid {

    /** The value printed by {@link #printX()}. */
    private int x;

    /** Creates an instance whose value is 0. */
    valid() {
        this.x = 0;
    }

    /** Writes the current value, followed by a line break, to standard output. */
    void printX() {
        System.out.println(this.x);
    }

    /**
     * Creates one instance and prints its value.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        new valid().printX();
    }
}
File mode changed
This diff could not be displayed because it is too large.