hyungyun.Moon

read data

1 1 <?xml version="1.0" encoding="UTF-8"?>
2 -<module type="JAVA_MODULE" version="4">
3 - <component name="NewModuleRootManager" inherit-compiler-output="true">
4 - <exclude-output />
5 - <content url="file://$MODULE_DIR$" />
6 - <orderEntry type="inheritedJdk" />
7 - <orderEntry type="sourceFolder" forTests="false" />
8 - </component>
9 -</module>
\ No newline at end of file
2 +<module type="JAVA_MODULE" version="4" />
\ No newline at end of file
...
1 -<?xml version="1.0" encoding="UTF-8"?>
2 -<project version="4">
3 - <component name="ProjectModuleManager">
4 - <modules>
5 - <module fileurl="file://$PROJECT_DIR$/.idea/Detecting_fraud_clicks.iml" filepath="$PROJECT_DIR$/.idea/Detecting_fraud_clicks.iml" />
6 - </modules>
7 - </component>
8 -</project>
\ No newline at end of file
@@ -16,6 +16,11 @@
16 16 <artifactId>spark-core_2.11</artifactId>
17 17 <version>2.3.0</version>
18 18 </dependency>
19 + <dependency>
20 + <groupId>org.apache.spark</groupId>
21 + <artifactId>spark-sql_2.11</artifactId>
22 + <version>2.3.0</version>
23 + </dependency>
19 24
20 25 </dependencies>
21 26
...
1 +import java.text.ParseException;
2 +import java.text.SimpleDateFormat;
3 +import java.util.Calendar;
4 +
5 +/**
6 + * A utility class that gathers Calendar-related helper methods.
7 + *
8 + * @author croute
9 + * @since 2011.02.10
10 + */
11 +public class DateUtil
12 +{
13 +
14 + /**
15 + * Converts a Calendar object to a string in yyyy-MM-dd HH:mm:ss format.
16 + *
17 + * @param cal the Calendar object to format
18 + * @return the formatted string
19 + */
20 + public static String StringFromCalendar(Calendar cal)
21 + {
22 + // Convert the date to a string for transmission
23 + SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
24 + return formatter.format(cal.getTime());
25 + }
26 +
27 + /**
28 + * Converts a Calendar object to a string in yyyy-MM-dd format.
29 + *
30 + * @param cal the Calendar object to format
31 + * @return the formatted string
32 + */
33 + public static String StringSimpleFromCalendar(Calendar cal)
34 + {
35 + // Convert the date to a string for transmission
36 + SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
37 + return formatter.format(cal.getTime());
38 + }
39 +
40 + /**
41 + * Converts a yyyy-MM-dd HH:mm:ss string into a Calendar object.
42 + * If parsing fails, the current date and time are returned; an empty string yields null.
43 + *
44 + * @param date the date string to parse
45 + * @return the converted Calendar object
46 + */
47 + public static Calendar CalendarFromString(String date)
48 + {
49 + if (date.length() == 0)
50 + return null;
51 + Calendar cal = Calendar.getInstance();
52 + try
53 + {
54 + //String oldstring = "2011-01-18 00:00:00.0";
55 + // Date date = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S").parse(oldstring);
56 + SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
57 + cal.setTime(formatter.parse(date));
58 + }
59 + catch(ParseException e)
60 + {
61 + e.printStackTrace();
62 + }
63 + return cal;
64 + }
65 +
66 + /**
67 + * Converts a yyyy-MM-dd string into a Calendar object.
68 + * If parsing fails, the current date and time are returned.
69 + *
70 + * @param date the date string to parse
71 + * @return the converted Calendar object
72 + */
73 + public static Calendar CalendarFromStringSimple(String date)
74 + {
75 + Calendar cal = Calendar.getInstance();
76 +
77 + try
78 + {
79 + SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
80 + cal.setTime(formatter.parse(date));
81 + }
82 + catch(ParseException e)
83 + {
84 + e.printStackTrace();
85 + }
86 + return cal;
87 + }
88 +}
\ No newline at end of file
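
For reference, a minimal usage sketch of the DateUtil helpers added above (the wrapper class name DateUtilDemo and the sample timestamp, taken from the example row in train.csv, are illustrative only):

import java.util.Calendar;

public class DateUtilDemo {
    public static void main(String[] args) {
        // Parse a click_time value in the yyyy-MM-dd HH:mm:ss format used by train.csv.
        Calendar click = DateUtil.CalendarFromString("2017-11-07 09:30:38");

        // Round-trip the Calendar back to both string forms.
        System.out.println(DateUtil.StringFromCalendar(click));       // 2017-11-07 09:30:38
        System.out.println(DateUtil.StringSimpleFromCalendar(click)); // 2017-11-07

        // An empty attributed_time field yields null rather than a Calendar.
        System.out.println(DateUtil.CalendarFromString(""));          // null
    }
}
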
1 1 import org.apache.spark.SparkConf;
2 2 import org.apache.spark.api.java.JavaRDD;
3 3 import org.apache.spark.api.java.JavaSparkContext;
4 +import org.apache.spark.api.java.function.Function;
5 +import org.apache.spark.sql.Dataset;
6 +import org.apache.spark.sql.Row;
7 +import org.apache.spark.sql.SQLContext;
8 +import org.apache.spark.sql.SparkSession;
9 +import org.apache.spark.sql.types.StructType;
10 +import scala.Serializable;
4 11 import scala.Tuple2;
5 12
6 13 import java.util.Arrays;
14 +import java.util.Calendar;
7 15 import java.util.List;
8 16
17 +//ip,app,device,os,channel,click_time,attributed_time,is_attributed
18 +//87540,12,1,13,497,2017-11-07 09:30:38,,0
19 +class Record implements Serializable {
20 + int ip;
21 + int app;
22 + int device;
23 + int os;
24 + int channel;
25 + Calendar clickTime;
26 + Calendar attributedTime;
27 + boolean isAttributed;
28 +
29 + // constructor; getters and setters are omitted
30 + public Record(int pIp, int pApp, int pDevice, int pOs, int pChannel, Calendar pClickTime, Calendar pAttributedTime, boolean pIsAttributed) {
31 + ip = pIp;
32 + app = pApp;
33 + device = pDevice;
34 + os = pOs;
35 + channel = pChannel;
36 + clickTime = pClickTime;
37 + attributedTime = pAttributedTime;
38 + isAttributed = pIsAttributed;
39 + }
40 +}
41 +
9 42 public class MapExample {
10 43
11 44 static SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("Cesco");
12 45 static JavaSparkContext sc = new JavaSparkContext(conf);
46 + static SQLContext sqlContext = new SQLContext(sc);
13 47
14 48 public static void main(String[] args) throws Exception {
49 + JavaRDD<String> file = sc.textFile("/Users/hyeongyunmun/Dropbox/DetectFraudClick/data/train.csv");
15 50
16 - // Parallelized with 2 partitions
17 - JavaRDD<String> x = sc.parallelize(
18 - Arrays.asList("spark", "rdd", "example", "sample", "example"),
19 - 2);
20 -
21 - // Word Count Map Example
22 - JavaRDD<Tuple2<String, Integer>> y1 = x.map(e -> new Tuple2<>(e, 1));
23 - List<Tuple2<String, Integer>> list1 = y1.collect();
24 -
25 - // Another example of making tuple with string and it's length
26 - JavaRDD<Tuple2<String, Integer>> y2 = x.map(e -> new Tuple2<>(e, e.length()));
27 - List<Tuple2<String, Integer>> list2 = y2.collect();
51 + final String header = file.first();
52 + JavaRDD<String> data = file.filter(line -> !line.equalsIgnoreCase(header));
28 53
29 - System.out.println(list1);
54 + JavaRDD<Record> records = data.map((line) -> {
55 + String[] fields = line.split(",");
56 + Record sd = new Record(Integer.parseInt(fields[0]), Integer.parseInt(fields[1]), Integer.parseInt(fields[2]), Integer.parseInt(fields[3]), Integer.parseInt(fields[4]), DateUtil.CalendarFromString(fields[5]), DateUtil.CalendarFromString(fields[6]), "1".equalsIgnoreCase(fields[7].trim()));
57 + return sd;
58 + });
30 59 }
31 60 }
...
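
Note that the pom change pulls in spark-sql, and MapExample now imports Dataset, Row, SparkSession, and StructType without using them yet, so the DataFrame API is presumably the next step. A minimal sketch, assuming Spark 2.3's CSV reader and the same train.csv path, of loading the file directly as a Dataset<Row> instead of splitting lines by hand (the SparkSession built here is hypothetical and not part of this commit):

SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName("Cesco")
        .getOrCreate();

// Read the CSV with its header row and let Spark infer column types.
Dataset<Row> clicks = spark.read()
        .option("header", "true")
        .option("inferSchema", "true")
        .csv("/Users/hyeongyunmun/Dropbox/DetectFraudClick/data/train.csv");

clicks.printSchema();
clicks.groupBy("ip").count().show(); // e.g. click counts per ip
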