신은섭(Shin Eun Seop)

Merge branch 'develop' into ml

File mode changed
1 <?xml version="1.0" encoding="UTF-8"?> 1 <?xml version="1.0" encoding="UTF-8"?>
2 -<module type="JAVA_MODULE" version="4" />
...\ No newline at end of file ...\ No newline at end of file
2 +<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
3 + <component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
4 + <output url="file://$MODULE_DIR$/target/classes" />
5 + <output-test url="file://$MODULE_DIR$/target/test-classes" />
6 + <content url="file://$MODULE_DIR$">
7 + <sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
8 + <sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" />
9 + <sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
10 + <sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
11 + <excludeFolder url="file://$MODULE_DIR$/target" />
12 + </content>
13 + <orderEntry type="inheritedJdk" />
14 + <orderEntry type="sourceFolder" forTests="false" />
15 + <orderEntry type="library" name="Maven: com.databricks:spark-csv_2.11:1.5.0" level="project" />
16 + <orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.1" level="project" />
17 + <orderEntry type="library" name="Maven: org.apache.spark:spark-core_2.11:2.3.0" level="project" />
18 + <orderEntry type="library" name="Maven: org.apache.avro:avro:1.7.7" level="project" />
19 + <orderEntry type="library" name="Maven: org.codehaus.jackson:jackson-core-asl:1.9.13" level="project" />
20 + <orderEntry type="library" name="Maven: org.codehaus.jackson:jackson-mapper-asl:1.9.13" level="project" />
21 + <orderEntry type="library" name="Maven: com.thoughtworks.paranamer:paranamer:2.3" level="project" />
22 + <orderEntry type="library" name="Maven: org.apache.commons:commons-compress:1.4.1" level="project" />
23 + <orderEntry type="library" name="Maven: org.tukaani:xz:1.0" level="project" />
24 + <orderEntry type="library" name="Maven: org.apache.avro:avro-mapred:hadoop2:1.7.7" level="project" />
25 + <orderEntry type="library" name="Maven: org.apache.avro:avro-ipc:1.7.7" level="project" />
26 + <orderEntry type="library" name="Maven: org.apache.avro:avro-ipc:tests:1.7.7" level="project" />
27 + <orderEntry type="library" name="Maven: com.twitter:chill_2.11:0.8.4" level="project" />
28 + <orderEntry type="library" name="Maven: com.esotericsoftware:kryo-shaded:3.0.3" level="project" />
29 + <orderEntry type="library" name="Maven: com.esotericsoftware:minlog:1.3.0" level="project" />
30 + <orderEntry type="library" name="Maven: org.objenesis:objenesis:2.1" level="project" />
31 + <orderEntry type="library" name="Maven: com.twitter:chill-java:0.8.4" level="project" />
32 + <orderEntry type="library" name="Maven: org.apache.xbean:xbean-asm5-shaded:4.4" level="project" />
33 + <orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-client:2.6.5" level="project" />
34 + <orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-common:2.6.5" level="project" />
35 + <orderEntry type="library" name="Maven: commons-cli:commons-cli:1.2" level="project" />
36 + <orderEntry type="library" name="Maven: xmlenc:xmlenc:0.52" level="project" />
37 + <orderEntry type="library" name="Maven: commons-httpclient:commons-httpclient:3.1" level="project" />
38 + <orderEntry type="library" name="Maven: commons-io:commons-io:2.4" level="project" />
39 + <orderEntry type="library" name="Maven: commons-collections:commons-collections:3.2.2" level="project" />
40 + <orderEntry type="library" name="Maven: commons-configuration:commons-configuration:1.6" level="project" />
41 + <orderEntry type="library" name="Maven: commons-digester:commons-digester:1.8" level="project" />
42 + <orderEntry type="library" name="Maven: commons-beanutils:commons-beanutils:1.7.0" level="project" />
43 + <orderEntry type="library" name="Maven: commons-beanutils:commons-beanutils-core:1.8.0" level="project" />
44 + <orderEntry type="library" name="Maven: com.google.code.gson:gson:2.2.4" level="project" />
45 + <orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-auth:2.6.5" level="project" />
46 + <orderEntry type="library" name="Maven: org.apache.directory.server:apacheds-kerberos-codec:2.0.0-M15" level="project" />
47 + <orderEntry type="library" name="Maven: org.apache.directory.server:apacheds-i18n:2.0.0-M15" level="project" />
48 + <orderEntry type="library" name="Maven: org.apache.directory.api:api-asn1-api:1.0.0-M20" level="project" />
49 + <orderEntry type="library" name="Maven: org.apache.directory.api:api-util:1.0.0-M20" level="project" />
50 + <orderEntry type="library" name="Maven: org.apache.curator:curator-client:2.6.0" level="project" />
51 + <orderEntry type="library" name="Maven: org.htrace:htrace-core:3.0.4" level="project" />
52 + <orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-hdfs:2.6.5" level="project" />
53 + <orderEntry type="library" name="Maven: org.mortbay.jetty:jetty-util:6.1.26" level="project" />
54 + <orderEntry type="library" name="Maven: xerces:xercesImpl:2.9.1" level="project" />
55 + <orderEntry type="library" name="Maven: xml-apis:xml-apis:1.3.04" level="project" />
56 + <orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-mapreduce-client-app:2.6.5" level="project" />
57 + <orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-mapreduce-client-common:2.6.5" level="project" />
58 + <orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-yarn-client:2.6.5" level="project" />
59 + <orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-yarn-server-common:2.6.5" level="project" />
60 + <orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-mapreduce-client-shuffle:2.6.5" level="project" />
61 + <orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-yarn-api:2.6.5" level="project" />
62 + <orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-mapreduce-client-core:2.6.5" level="project" />
63 + <orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-yarn-common:2.6.5" level="project" />
64 + <orderEntry type="library" name="Maven: javax.xml.bind:jaxb-api:2.2.2" level="project" />
65 + <orderEntry type="library" name="Maven: javax.xml.stream:stax-api:1.0-2" level="project" />
66 + <orderEntry type="library" name="Maven: org.codehaus.jackson:jackson-jaxrs:1.9.13" level="project" />
67 + <orderEntry type="library" name="Maven: org.codehaus.jackson:jackson-xc:1.9.13" level="project" />
68 + <orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-mapreduce-client-jobclient:2.6.5" level="project" />
69 + <orderEntry type="library" name="Maven: org.apache.hadoop:hadoop-annotations:2.6.5" level="project" />
70 + <orderEntry type="library" name="Maven: org.apache.spark:spark-launcher_2.11:2.3.0" level="project" />
71 + <orderEntry type="library" name="Maven: org.apache.spark:spark-kvstore_2.11:2.3.0" level="project" />
72 + <orderEntry type="library" name="Maven: org.fusesource.leveldbjni:leveldbjni-all:1.8" level="project" />
73 + <orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-core:2.6.7" level="project" />
74 + <orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-annotations:2.6.7" level="project" />
75 + <orderEntry type="library" name="Maven: org.apache.spark:spark-network-common_2.11:2.3.0" level="project" />
76 + <orderEntry type="library" name="Maven: org.apache.spark:spark-network-shuffle_2.11:2.3.0" level="project" />
77 + <orderEntry type="library" name="Maven: org.apache.spark:spark-unsafe_2.11:2.3.0" level="project" />
78 + <orderEntry type="library" name="Maven: net.java.dev.jets3t:jets3t:0.9.4" level="project" />
79 + <orderEntry type="library" name="Maven: org.apache.httpcomponents:httpcore:4.4.1" level="project" />
80 + <orderEntry type="library" name="Maven: org.apache.httpcomponents:httpclient:4.5" level="project" />
81 + <orderEntry type="library" name="Maven: commons-codec:commons-codec:2.0-SNAPSHOT" level="project" />
82 + <orderEntry type="library" name="Maven: javax.activation:activation:1.1.1" level="project" />
83 + <orderEntry type="library" name="Maven: org.bouncycastle:bcprov-jdk15on:1.52" level="project" />
84 + <orderEntry type="library" name="Maven: com.jamesmurty.utils:java-xmlbuilder:1.1" level="project" />
85 + <orderEntry type="library" name="Maven: net.iharder:base64:2.3.8" level="project" />
86 + <orderEntry type="library" name="Maven: org.apache.curator:curator-recipes:2.6.0" level="project" />
87 + <orderEntry type="library" name="Maven: org.apache.curator:curator-framework:2.6.0" level="project" />
88 + <orderEntry type="library" name="Maven: org.apache.zookeeper:zookeeper:3.4.6" level="project" />
89 + <orderEntry type="library" name="Maven: com.google.guava:guava:16.0.1" level="project" />
90 + <orderEntry type="library" name="Maven: javax.servlet:javax.servlet-api:3.1.0" level="project" />
91 + <orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.5" level="project" />
92 + <orderEntry type="library" name="Maven: org.apache.commons:commons-math3:3.4.1" level="project" />
93 + <orderEntry type="library" name="Maven: com.google.code.findbugs:jsr305:1.3.9" level="project" />
94 + <orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.7.16" level="project" />
95 + <orderEntry type="library" name="Maven: org.slf4j:jul-to-slf4j:1.7.16" level="project" />
96 + <orderEntry type="library" name="Maven: org.slf4j:jcl-over-slf4j:1.7.16" level="project" />
97 + <orderEntry type="library" name="Maven: log4j:log4j:1.2.17" level="project" />
98 + <orderEntry type="library" name="Maven: org.slf4j:slf4j-log4j12:1.7.16" level="project" />
99 + <orderEntry type="library" name="Maven: com.ning:compress-lzf:1.0.3" level="project" />
100 + <orderEntry type="library" name="Maven: org.xerial.snappy:snappy-java:1.1.2.6" level="project" />
101 + <orderEntry type="library" name="Maven: org.lz4:lz4-java:1.4.0" level="project" />
102 + <orderEntry type="library" name="Maven: com.github.luben:zstd-jni:1.3.2-2" level="project" />
103 + <orderEntry type="library" name="Maven: org.roaringbitmap:RoaringBitmap:0.5.11" level="project" />
104 + <orderEntry type="library" name="Maven: commons-net:commons-net:2.2" level="project" />
105 + <orderEntry type="library" name="Maven: org.scala-lang:scala-library:2.11.8" level="project" />
106 + <orderEntry type="library" name="Maven: org.json4s:json4s-jackson_2.11:3.2.11" level="project" />
107 + <orderEntry type="library" name="Maven: org.json4s:json4s-core_2.11:3.2.11" level="project" />
108 + <orderEntry type="library" name="Maven: org.json4s:json4s-ast_2.11:3.2.11" level="project" />
109 + <orderEntry type="library" name="Maven: org.scala-lang:scalap:2.11.0" level="project" />
110 + <orderEntry type="library" name="Maven: org.scala-lang:scala-compiler:2.11.0" level="project" />
111 + <orderEntry type="library" name="Maven: org.scala-lang.modules:scala-xml_2.11:1.0.1" level="project" />
112 + <orderEntry type="library" name="Maven: org.glassfish.jersey.core:jersey-client:2.22.2" level="project" />
113 + <orderEntry type="library" name="Maven: javax.ws.rs:javax.ws.rs-api:2.0.1" level="project" />
114 + <orderEntry type="library" name="Maven: org.glassfish.hk2:hk2-api:2.4.0-b34" level="project" />
115 + <orderEntry type="library" name="Maven: org.glassfish.hk2:hk2-utils:2.4.0-b34" level="project" />
116 + <orderEntry type="library" name="Maven: org.glassfish.hk2.external:aopalliance-repackaged:2.4.0-b34" level="project" />
117 + <orderEntry type="library" name="Maven: org.glassfish.hk2.external:javax.inject:2.4.0-b34" level="project" />
118 + <orderEntry type="library" name="Maven: org.glassfish.hk2:hk2-locator:2.4.0-b34" level="project" />
119 + <orderEntry type="library" name="Maven: org.javassist:javassist:3.18.1-GA" level="project" />
120 + <orderEntry type="library" name="Maven: org.glassfish.jersey.core:jersey-common:2.22.2" level="project" />
121 + <orderEntry type="library" name="Maven: javax.annotation:javax.annotation-api:1.2" level="project" />
122 + <orderEntry type="library" name="Maven: org.glassfish.jersey.bundles.repackaged:jersey-guava:2.22.2" level="project" />
123 + <orderEntry type="library" name="Maven: org.glassfish.hk2:osgi-resource-locator:1.0.1" level="project" />
124 + <orderEntry type="library" name="Maven: org.glassfish.jersey.core:jersey-server:2.22.2" level="project" />
125 + <orderEntry type="library" name="Maven: org.glassfish.jersey.media:jersey-media-jaxb:2.22.2" level="project" />
126 + <orderEntry type="library" name="Maven: javax.validation:validation-api:1.1.0.Final" level="project" />
127 + <orderEntry type="library" name="Maven: org.glassfish.jersey.containers:jersey-container-servlet:2.22.2" level="project" />
128 + <orderEntry type="library" name="Maven: org.glassfish.jersey.containers:jersey-container-servlet-core:2.22.2" level="project" />
129 + <orderEntry type="library" name="Maven: io.netty:netty-all:4.1.17.Final" level="project" />
130 + <orderEntry type="library" name="Maven: io.netty:netty:3.9.9.Final" level="project" />
131 + <orderEntry type="library" name="Maven: com.clearspring.analytics:stream:2.7.0" level="project" />
132 + <orderEntry type="library" name="Maven: io.dropwizard.metrics:metrics-core:3.1.5" level="project" />
133 + <orderEntry type="library" name="Maven: io.dropwizard.metrics:metrics-jvm:3.1.5" level="project" />
134 + <orderEntry type="library" name="Maven: io.dropwizard.metrics:metrics-json:3.1.5" level="project" />
135 + <orderEntry type="library" name="Maven: io.dropwizard.metrics:metrics-graphite:3.1.5" level="project" />
136 + <orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-databind:2.6.7.1" level="project" />
137 + <orderEntry type="library" name="Maven: com.fasterxml.jackson.module:jackson-module-scala_2.11:2.6.7.1" level="project" />
138 + <orderEntry type="library" name="Maven: org.scala-lang:scala-reflect:2.11.8" level="project" />
139 + <orderEntry type="library" name="Maven: com.fasterxml.jackson.module:jackson-module-paranamer:2.7.9" level="project" />
140 + <orderEntry type="library" name="Maven: org.apache.ivy:ivy:2.4.0" level="project" />
141 + <orderEntry type="library" name="Maven: oro:oro:2.0.8" level="project" />
142 + <orderEntry type="library" name="Maven: net.razorvine:pyrolite:4.13" level="project" />
143 + <orderEntry type="library" name="Maven: net.sf.py4j:py4j:0.10.6" level="project" />
144 + <orderEntry type="library" name="Maven: org.apache.spark:spark-tags_2.11:2.3.0" level="project" />
145 + <orderEntry type="library" name="Maven: org.apache.commons:commons-crypto:1.0.0" level="project" />
146 + <orderEntry type="library" name="Maven: org.spark-project.spark:unused:1.0.0" level="project" />
147 + <orderEntry type="library" name="Maven: org.apache.spark:spark-sql_2.11:2.3.0" level="project" />
148 + <orderEntry type="library" name="Maven: com.univocity:univocity-parsers:2.5.9" level="project" />
149 + <orderEntry type="library" name="Maven: org.apache.spark:spark-sketch_2.11:2.3.0" level="project" />
150 + <orderEntry type="library" name="Maven: org.apache.spark:spark-catalyst_2.11:2.3.0" level="project" />
151 + <orderEntry type="library" name="Maven: org.scala-lang.modules:scala-parser-combinators_2.11:1.0.4" level="project" />
152 + <orderEntry type="library" name="Maven: org.codehaus.janino:janino:3.0.8" level="project" />
153 + <orderEntry type="library" name="Maven: org.codehaus.janino:commons-compiler:3.0.8" level="project" />
154 + <orderEntry type="library" name="Maven: org.antlr:antlr4-runtime:4.7" level="project" />
155 + <orderEntry type="library" name="Maven: org.apache.orc:orc-core:nohive:1.4.1" level="project" />
156 + <orderEntry type="library" name="Maven: com.google.protobuf:protobuf-java:2.5.0" level="project" />
157 + <orderEntry type="library" name="Maven: commons-lang:commons-lang:2.6" level="project" />
158 + <orderEntry type="library" name="Maven: io.airlift:aircompressor:0.8" level="project" />
159 + <orderEntry type="library" name="Maven: org.apache.orc:orc-mapreduce:nohive:1.4.1" level="project" />
160 + <orderEntry type="library" name="Maven: org.apache.parquet:parquet-column:1.8.2" level="project" />
161 + <orderEntry type="library" name="Maven: org.apache.parquet:parquet-common:1.8.2" level="project" />
162 + <orderEntry type="library" name="Maven: org.apache.parquet:parquet-encoding:1.8.2" level="project" />
163 + <orderEntry type="library" name="Maven: org.apache.parquet:parquet-hadoop:1.8.2" level="project" />
164 + <orderEntry type="library" name="Maven: org.apache.parquet:parquet-format:2.3.1" level="project" />
165 + <orderEntry type="library" name="Maven: org.apache.parquet:parquet-jackson:1.8.2" level="project" />
166 + <orderEntry type="library" name="Maven: org.apache.arrow:arrow-vector:0.8.0" level="project" />
167 + <orderEntry type="library" name="Maven: org.apache.arrow:arrow-format:0.8.0" level="project" />
168 + <orderEntry type="library" name="Maven: org.apache.arrow:arrow-memory:0.8.0" level="project" />
169 + <orderEntry type="library" name="Maven: joda-time:joda-time:2.9.9" level="project" />
170 + <orderEntry type="library" name="Maven: com.carrotsearch:hppc:0.7.2" level="project" />
171 + <orderEntry type="library" name="Maven: com.vlkan:flatbuffers:1.2.0-3f79e055" level="project" />
172 + <orderEntry type="library" name="Maven: com.databricks:spark-csv_2.11:1.5.0" level="project" />
173 + <orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.1" level="project" />
174 + </component>
175 +</module>
......
1 +<?xml version="1.0" encoding="UTF-8"?>
2 +<project version="4">
3 + <component name="MarkdownExportedFiles">
4 + <htmlFiles />
5 + <imageFiles />
6 + <otherFiles />
7 + </component>
8 +</project>
...\ No newline at end of file ...\ No newline at end of file
...@@ -8,7 +8,17 @@ ...@@ -8,7 +8,17 @@
8 </list> 8 </list>
9 </option> 9 </option>
10 </component> 10 </component>
11 - <component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="true" project-jdk-name="1.8" project-jdk-type="JavaSDK"> 11 + <component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" project-jdk-name="1.8" project-jdk-type="JavaSDK">
12 <output url="file://$PROJECT_DIR$/out" /> 12 <output url="file://$PROJECT_DIR$/out" />
13 </component> 13 </component>
14 + <component name="MavenProjectsManager">
15 + <option name="originalFiles">
16 + <list>
17 + <option value="$PROJECT_DIR$/pom.xml" />
18 + </list>
19 + </option>
20 + </component>
21 + <component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="false" project-jdk-name="1.8" project-jdk-type="JavaSDK">
22 + <output url="file:///tmp" />
23 + </component>
14 </project> 24 </project>
...\ No newline at end of file ...\ No newline at end of file
......
File mode changed
File mode changed
...@@ -26,7 +26,30 @@ ...@@ -26,7 +26,30 @@
26 <artifactId>spark-mllib_2.11</artifactId> 26 <artifactId>spark-mllib_2.11</artifactId>
27 <version>2.3.0</version> 27 <version>2.3.0</version>
28 </dependency> 28 </dependency>
29 - 29 + <dependency>
30 + <groupId>org.apache.spark</groupId>
31 + <artifactId>spark-sql_2.11</artifactId>
32 + <version>2.3.0</version>
33 + </dependency>
34 + <dependency>
35 + <groupId>com.databricks</groupId>
36 + <artifactId>spark-csv_2.11</artifactId>
37 + <version>1.5.0</version>
38 + </dependency>
30 </dependencies> 39 </dependencies>
31 40
41 + <build>
42 + <plugins>
43 + <plugin>
44 + <groupId>org.apache.maven.plugins</groupId>
45 + <artifactId>maven-compiler-plugin</artifactId>
46 + <version>3.6.1</version>
47 + <configuration>
48 + <source>1.8</source>
49 + <target>1.8</target>
50 + </configuration>
51 + </plugin>
52 + </plugins>
53 + </build>
54 +
32 </project> 55 </project>
...\ No newline at end of file ...\ No newline at end of file
......
1 +import org.apache.spark.sql.Dataset;
2 +import org.apache.spark.sql.Row;
3 +import org.apache.spark.sql.SparkSession;
4 +import org.apache.spark.sql.expressions.Window;
5 +import org.apache.spark.sql.expressions.WindowSpec;
6 +
7 +import static org.apache.spark.sql.functions.*;
8 +import static org.apache.spark.sql.functions.lit;
9 +import static org.apache.spark.sql.functions.when;
10 +
11 +public class Aggregation {
12 +
13 + public static void main(String[] args) throws Exception {
14 +
15 + // Create the Spark session (master is set to "local")
16 + SparkSession spark = SparkSession
17 + .builder()
18 + .appName("Detecting Fraud Clicks")
19 + .master("local")
20 + .getOrCreate();
21 +
22 + // Load the CSV and apply each aggregation step in order
23 + Aggregation agg = new Aggregation();
24 +
25 + Dataset<Row> dataset = agg.loadCSVDataSet("./train_sample.csv", spark);
26 + dataset = agg.changeTimestempToLong(dataset);
27 + dataset = agg.averageValidClickCount(dataset);
28 + dataset = agg.clickTimeDelta(dataset);
29 + dataset = agg.countClickInTenMinutes(dataset);
30 +
31 + // Smoke test: show up to 10 result rows for a single ip/app pair
32 + dataset.where("ip == '5348' and app == '19'").show(10);
33 + }
34 +
35 +
36 + private Dataset<Row> loadCSVDataSet(String path, SparkSession spark){
37 + // Read the CSV at 'path' into a Dataset, treating the first row as a header and inferring the column types
38 + return spark.read().format("csv")
39 + .option("inferSchema", "true")
40 + .option("header", "true")
41 + .load(path);
42 + }
43 +
44 + private Dataset<Row> changeTimestempToLong(Dataset<Row> dataset){
45 + // Cast 'click_time' and 'attributed_time' to long under new 'utc_*' column names, then drop the original columns
46 + Dataset<Row> newDF = dataset.withColumn("utc_click_time", dataset.col("click_time").cast("long"));
47 + newDF = newDF.withColumn("utc_attributed_time", dataset.col("attributed_time").cast("long"));
48 + newDF = newDF.drop("click_time").drop("attributed_time");
49 + return newDF;
50 + }
51 +
52 + private Dataset<Row> averageValidClickCount(Dataset<Row> dataset){
53 + // Window partitioned by 'ip' and 'app', ordered by 'utc_click_time', covering the rows from the first row of the partition up to the current row
54 + WindowSpec w = Window.partitionBy("ip", "app")
55 + .orderBy("utc_click_time")
56 + .rowsBetween(Window.unboundedPreceding(), Window.currentRow());
57 +
58 + // Running ratio: cumulative sum of 'is_attributed' divided by cumulative click count; the two helper columns are dropped afterwards
59 + Dataset<Row> newDF = dataset.withColumn("cum_count_click", count("utc_click_time").over(w));
60 + newDF = newDF.withColumn("cum_sum_attributed", sum("is_attributed").over(w));
61 + newDF = newDF.withColumn("avg_valid_click_count", col("cum_sum_attributed").divide(col("cum_count_click")));
62 + newDF = newDF.drop("cum_count_click", "cum_sum_attributed");
63 + return newDF;
64 + }
65 +
66 + private Dataset<Row> clickTimeDelta(Dataset<Row> dataset){
67 + WindowSpec w = Window.partitionBy ("ip")
68 + .orderBy("utc_click_time");
69 +
70 + Dataset<Row> newDF = dataset.withColumn("lag(utc_click_time)", lag("utc_click_time",1).over(w));
71 + newDF = newDF.withColumn("click_time_delta", when(col("lag(utc_click_time)").isNull(),
72 + lit(0)).otherwise(col("utc_click_time")).minus(when(col("lag(utc_click_time)").isNull(),
73 + lit(0)).otherwise(col("lag(utc_click_time)"))));
74 + newDF = newDF.drop("lag(utc_click_time)");
75 + return newDF;
76 + }
77 +
78 + private Dataset<Row> countClickInTenMinutes(Dataset<Row> dataset){
79 + WindowSpec w = Window.partitionBy("ip")
80 + .orderBy("utc_click_time")
81 + .rangeBetween(Window.currentRow(),Window.currentRow()+600);
82 +
83 + Dataset<Row> newDF = dataset.withColumn("count_click_in_ten_mins",
84 + (count("utc_click_time").over(w)).minus(1)); //TODO decide whether the count should include the current click itself.
85 + return newDF;
86 + }
87 +}
1 -import java.text.ParseException;
2 -import java.text.SimpleDateFormat;
3 -import java.util.Calendar;
4 -
5 -/**
6 - * Calendar 객체 관련 기능들을 모아놓은 유틸리티 클래스
7 - *
8 - * @author croute
9 - * @since 2011.02.10
10 - */
11 -public class DateUtil
12 -{
13 -
14 - /**
15 - * 캘린더 객체를 yyyy-MM-dd HH:mm:ss 형태의 문자열로 변환합니다.
16 - *
17 - * @param cal 캘린더 객체
18 - * @return 변환된 문자열
19 - */
20 - public static String StringFromCalendar(Calendar cal)
21 - {
22 - // 날짜를 통신용 문자열로 변경
23 - SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
24 - return formatter.format(cal.getTime());
25 - }
26 -
27 - /**
28 - * 캘린더 객체를 yyyy-MM-dd형태의 문자열로 변환합니다.
29 - *
30 - * @param cal 캘린더 객체
31 - * @return 변환된 문자열
32 - */
33 - public static String StringSimpleFromCalendar(Calendar cal)
34 - {
35 - // 날짜를 통신용 문자열로 변경
36 - SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
37 - return formatter.format(cal.getTime());
38 - }
39 -
40 - /**
41 - * yyyy-MM-dd HH:mm:ss 형태의 문자열을 캘린더 객체로 변환합니다.
42 - * 만약 변환에 실패할 경우 오늘 날짜를 반환합니다.
43 - *
44 - * @param date 날짜를 나타내는 문자열
45 - * @return 변환된 캘린더 객체
46 - */
47 - public static Calendar CalendarFromString(String date)
48 - {
49 - if (date.length() == 0)
50 - return null;
51 - Calendar cal = Calendar.getInstance();
52 - try
53 - {
54 - //String oldstring = "2011-01-18 00:00:00.0";
55 - // Date date = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S").parse(oldstring);
56 - SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
57 - cal.setTime(formatter.parse(date));
58 - }
59 - catch(ParseException e)
60 - {
61 - e.printStackTrace();
62 - }
63 - return cal;
64 - }
65 -
66 - /**
67 - * yyyy-MM-dd 형태의 문자열을 캘린더 객체로 변환합니다.
68 - * 만약 변환에 실패할 경우 오늘 날짜를 반환합니다.
69 - *
70 - * @param date 날짜를 나타내는 문자열
71 - * @return 변환된 캘린더 객체
72 - */
73 - public static Calendar CalendarFromStringSimple(String date)
74 - {
75 - Calendar cal = Calendar.getInstance();
76 -
77 - try
78 - {
79 - SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
80 - cal.setTime(formatter.parse(date));
81 - }
82 - catch(ParseException e)
83 - {
84 - e.printStackTrace();
85 - }
86 - return cal;
87 - }
88 -}
...\ No newline at end of file ...\ No newline at end of file
...@@ -22,22 +22,6 @@ import java.util.*; ...@@ -22,22 +22,6 @@ import java.util.*;
22 22
23 // ml 23 // ml
24 24
25 -//ip,app,device,os,channel,click_time,attributed_time,is_attributed
26 -//87540,12,1,13,497,2017-11-07 09:30:38,,0
27 -
28 -class RecordComparator implements Comparator<Record> {
29 - @Override
30 - public int compare(Record v1 , Record v2) {
31 -// if(a.ano < b.ano) return -1;
32 -// else if(a.ano == b.ano) return 0;
33 -// else return 1;
34 - if (v1.ip.compareTo(v2.ip) == 0) {
35 - return v1.clickTime.compareTo(v2.clickTime);
36 - }
37 - return v1.ip.compareTo(v2.ip);
38 - }
39 -}
40 -
41 public class MapExample { 25 public class MapExample {
42 26
43 static SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("Cesco"); 27 static SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("Cesco");
...@@ -45,122 +29,10 @@ public class MapExample { ...@@ -45,122 +29,10 @@ public class MapExample {
45 static SQLContext sqlContext = new SQLContext(sc); 29 static SQLContext sqlContext = new SQLContext(sc);
46 30
47 public static void main(String[] args) throws Exception { 31 public static void main(String[] args) throws Exception {
48 - JavaRDD<String> file = sc.textFile("data/train.csv", 1);
49 -
50 - final String header = file.first();
51 - JavaRDD<String> data = file.filter(line -> !line.equalsIgnoreCase(header));
52 -
53 - JavaRDD<Record> records = data.map(line -> {
54 - String[] fields = line.split(",");
55 - Record sd = new Record(Integer.parseInt(fields[0]), Integer.parseInt(fields[1]), Integer.parseInt(fields[2]), Integer.parseInt(fields[3]), Integer.parseInt(fields[4]), fields[5], fields[6], Integer.parseInt(fields[7].trim()));
56 - return sd;
57 - });
58 -
59 -// JavaRDD<Tuple4<Integer,Double,Long,Integer>> secondSortRDD = firstSortRDD.keyBy(new Function<Tuple4<Integer, Double, Long, Integer>, Tuple2<Double, Long>>(){
60 -// @Override
61 -// public Tuple2<Double, Long> call(Tuple4<Integer, Double, Long, Integer> value) throws Exception {
62 -// return new Tuple2(value._2(),value._3());
63 -// }}).sortByKey(new TupleComparator()).values();
64 -
65 - JavaRDD<Record> firstSorted = records.sortBy(new Function<Record, String>() {
66 - @Override
67 - public String call(Record record) throws Exception {
68 - return record.clickTime;
69 - }
70 - }, true, 1);
71 -
72 - JavaRDD<Record> sortedRecords = firstSorted.sortBy(new Function<Record, Integer>() {
73 - @Override
74 - public Integer call(Record record) throws Exception {
75 - return record.ip.intValue();
76 - }
77 - }, true, 1);
78 -
79 -
80 - /*
81 - //두개를 한번에 정렬해보려 했지만 실패
82 - JavaRDD<Record> sortedRecords = records.keyBy(new Function<Record, Record>(){
83 - @Override
84 - public Record call(Record record) throws Exception {
85 - return new Record(record.ip, record.app, record.device, record.os, record.channel, record.clickTime, record.attributedTime, record.isAttributed);
86 - }}).sortByKey(new RecordComparator()).values();
87 - */
88 -
89 -// System.out.println("sortedRecords");
90 -// sortedRecords.foreach(record -> {System.out.println(record.ip + " " + record.clickTime.getTime());});
91 -
92 -// System.out.println("make result");
93 - /*
94 - //map의 다음것을 가져오려했지만 실패
95 - JavaRDD<Record> result = sortedRecords.map(record -> {
96 - System.out.println("make addTen");
97 - Calendar addTen = Calendar.getInstance();
98 - addTen.setTime(record.clickTime.getTime());
99 - addTen.add(Calendar.MINUTE, 10);
100 -
101 - System.out.println("make count");
102 - int count = 0;
103 - for (Record temp: sortedRecords.collect()) {
104 - if (temp.ip.compareTo(record.ip) == 0 && temp.clickTime.compareTo(record.clickTime) > 0 && temp.clickTime.compareTo(addTen)< 0)
105 - count++;
106 - }
107 -
108 - return new Record(record.ip, record.app, record.device, record.os, record.channel, record.clickTime, record.attributedTime, record.isAttributed, count);
109 - });
110 - */
111 -// System.out.println("result");
112 -// result.foreach(record -> {System.out.println(record.ip + " " + record.clickTime.getTime());});
113 -
114 - /*
115 -
116 - for (final ListIterator<String> it = list.listIterator(); it.hasNext();) {
117 - final String s = it.next();
118 - System.out.println(it.previousIndex() + ": " + s);
119 - }
120 -
121 - for (ListIterator<Record> it = sortedRecords.collect().listIterator(); it.hasNext(); it = it.nextIndex()) {
122 - it.
123 - if (temp.ip.compareTo(record.ip) == 0 && temp.clickTime.compareTo(record.clickTime) > 0 && temp.clickTime.compareTo(addTen)< 0)
124 - count++;
125 - }
126 - */
127 -
128 -
129 - List<Record> list = sortedRecords.collect();
130 -
131 - List<Record> resultList = new ArrayList<Record>();
132 - for (int i = 0; i < list.size(); i++) {
133 - //System.out.println(list.get(i).ip);
134 -
135 - Record record = list.get(i);
136 -
137 - Calendar recordI = DateUtil.CalendarFromString(record.clickTime);
138 -
139 - Calendar addTen = Calendar.getInstance();
140 - addTen.setTime(recordI.getTime());
141 - addTen.add(Calendar.MINUTE, 10);
142 -
143 - int count = 0;
144 -
145 - for (int j = i+1; j < list.size() && list.get(j).ip.compareTo(record.ip) == 0; j++) {
146 - Calendar recordJ = DateUtil.CalendarFromString(list.get(j).clickTime);
147 - if (recordJ.compareTo(recordI) > 0 && recordJ.compareTo(addTen) < 0) {
148 - count++;
149 - } else {
150 - break;
151 - }
152 - }
153 -
154 - resultList.add(new Record(record.ip, record.app, record.device, record.os, record.channel, record.clickTime, record.attributedTime, record.isAttributed, count));
155 -
156 - }
157 -
158 - JavaRDD<Record> result = sc.parallelize(resultList);
159 -// result.foreach(record -> {System.out.println(record.ip + " " + record.clickTime.getTime() + " " + record.clickInTenMins);});
160 32
161 // Automatically identify categorical features, and index them. 33 // Automatically identify categorical features, and index them.
162 // Set maxCategories so features with > 4 distinct values are treated as continuous. 34 // Set maxCategories so features with > 4 distinct values are treated as continuous.
163 - Dataset<Row> resultds = sqlContext.createDataFrame(result, Record.class); 35 + Dataset<Row> resultds = sqlContext.createDataFrame(result);
164 36
165 System.out.println("schema start"); 37 System.out.println("schema start");
166 resultds.printSchema(); 38 resultds.printSchema();
......
1 -import scala.Serializable;
2 -
3 -public class Record implements Serializable {
4 - Integer ip;
5 - Integer app;
6 - Integer device;
7 - Integer os;
8 - Integer channel;
9 - String clickTime;
10 - String attributedTime;
11 - Integer isAttributed;
12 - Integer clickInTenMins;
13 -
14 - // constructor , getters and setters
15 - public Record(int pIp, int pApp, int pDevice, int pOs, int pChannel, String pClickTime, String pAttributedTime, Integer pIsAttributed) {
16 - ip = new Integer(pIp);
17 - app = new Integer(pApp);
18 - device = new Integer(pDevice);
19 - os = new Integer(pOs);
20 - channel = new Integer(pChannel);
21 - clickTime = pClickTime;
22 - attributedTime = pAttributedTime;
23 - isAttributed = new Integer(pIsAttributed);
24 - clickInTenMins = new Integer(0);
25 - }
26 -
27 - public Record(int pIp, int pApp, int pDevice, int pOs, int pChannel, String pClickTime, String pAttributedTime, Integer pIsAttributed, int pClickInTenMins) {
28 - ip = new Integer(pIp);
29 - app = new Integer(pApp);
30 - device = new Integer(pDevice);
31 - os = new Integer(pOs);
32 - channel = new Integer(pChannel);
33 - clickTime = pClickTime;
34 - attributedTime = pAttributedTime;
35 - isAttributed = new Integer(pIsAttributed);
36 - clickInTenMins = new Integer(pClickInTenMins);
37 - }
38 -
39 - public Integer getIp() {
40 - return ip;
41 - }
42 -
43 - public void setIp(Integer ip) {
44 - this.ip = ip;
45 - }
46 -
47 - public Integer getApp() {
48 - return app;
49 - }
50 -
51 - public void setApp(Integer app) {
52 - this.app = app;
53 - }
54 -
55 - public Integer getDevice() {
56 - return device;
57 - }
58 -
59 - public void setDevice(Integer device) {
60 - this.device = device;
61 - }
62 -
63 - public Integer getOs() {
64 - return os;
65 - }
66 -
67 - public void setOs(Integer os) {
68 - this.os = os;
69 - }
70 -
71 - public Integer getChannel() {
72 - return channel;
73 - }
74 -
75 - public void setChannel(Integer channel) {
76 - this.channel = channel;
77 - }
78 -
79 - public String getClickTime() {
80 - return clickTime;
81 - }
82 -
83 - public void setClickTime(String clickTime) {
84 - this.clickTime = clickTime;
85 - }
86 -
87 - public String getAttributedTime() {
88 - return attributedTime;
89 - }
90 -
91 - public void setAttributedTime(String attributedTime) {
92 - this.attributedTime = attributedTime;
93 - }
94 -
95 - public Integer getAttributed() {
96 - return isAttributed;
97 - }
98 -
99 - public void setAttributed(Integer attributed) {
100 - isAttributed = attributed;
101 - }
102 -
103 - public Integer getClickInTenMins() {
104 - return clickInTenMins;
105 - }
106 -
107 - public void setClickInTenMins(Integer clickInTenMins) {
108 - this.clickInTenMins = clickInTenMins;
109 - }
110 -}
...\ No newline at end of file ...\ No newline at end of file
1 -public class valid {
2 - private int x;
3 -
4 - valid() {
5 - x = 0;
6 - }
7 -
8 - void printX(){
9 - System.out.println(x);
10 - }
11 -
12 - public static void main(String[] args){
13 - valid v = new valid();
14 - v.printX();
15 - }
16 -
17 -}
File mode changed