Showing 3 changed files with 275 additions and 62 deletions
src/main/java/Aggregation.java
0 → 100644
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.expressions.Window;
+import org.apache.spark.sql.expressions.WindowSpec;
+
+import java.util.List;
+
+import static org.apache.spark.sql.functions.*;
+public class Aggregation {
+
+    public static void main(String[] args) throws Exception {
+
+        // Create session
+        SparkSession spark = SparkSession
+                .builder()
+                .appName("Detecting Fraud Clicks")
+                .master("local")
+                .getOrCreate();
+
+        // Build the features step by step
+        Aggregation agg = new Aggregation();
+
+        Dataset<Row> dataset = agg.loadCSVDataSet("/home/chris/.kaggle/competitions/talkingdata-adtracking-fraud-detection/mnt/ssd/kaggle-talkingdata2/competition_files/train_sample.csv", spark);
+        dataset = agg.changeTimestampToLong(dataset);
+        dataset = agg.averageValidClickCount(dataset);
+        dataset = agg.clickTimeDelta(dataset);
+        dataset = agg.countClickInTenMinutes(dataset);
+
+        long start = System.currentTimeMillis();
+
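+        // Flatten each Row into a String[] for the Swing table below.
+        // Note: dataset.count() runs an extra Spark job just to size the
+        // array, and parsing Row.toString() is brittle (it breaks if a
+        // field ever contains a comma), but it works for this numeric sample.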
+        List<String> logs_with_features = dataset.map((MapFunction<Row, String>) Row::toString, Encoders.STRING()).collectAsList();
+        String[][] contents = new String[(int) dataset.count()][11];
+        for (int i = 0; i < logs_with_features.size(); i++) {
+            String str_to_split = logs_with_features.get(i);
+            String[] tmp = str_to_split.substring(1, str_to_split.length() - 1).split(",");
+            contents[i] = tmp;
+        }
+
+        long end = System.currentTimeMillis();
+        System.out.println("JK's Procedure time elapsed : " + (end - start) / 1000.0);
+
+        start = System.currentTimeMillis();
+        List<String> stringDataset = dataset.toJSON().collectAsList();
+        end = System.currentTimeMillis();
+        System.out.println("Steve's Procedure 1 time elapsed : " + (end - start) / 1000.0);
+        new GUI(stringDataset, contents);
+    }
+
+    private Dataset<Row> loadCSVDataSet(String path, SparkSession spark) {
+        // Read CSV into a Dataset, inferring column types from the data
+        return spark.read().format("csv")
+                .option("inferSchema", "true")
+                .option("header", "true")
+                .load(path);
+    }
+
+    private Dataset<Row> changeTimestampToLong(Dataset<Row> dataset) {
+        // Cast the timestamp columns to long (seconds since the Unix epoch)
+        Dataset<Row> newDF = dataset.withColumn("utc_click_time", dataset.col("click_time").cast("long"));
+        newDF = newDF.withColumn("utc_attributed_time", dataset.col("attributed_time").cast("long"));
+        newDF = newDF.drop("click_time").drop("attributed_time");
+        return newDF;
+    }
+
+    private Dataset<Row> averageValidClickCount(Dataset<Row> dataset) {
+        // Window partitioned by (ip, app), ordered by utc_click_time,
+        // framed from the first row of the partition to the current row
+        WindowSpec w = Window.partitionBy("ip", "app")
+                .orderBy("utc_click_time")
+                .rowsBetween(Window.unboundedPreceding(), Window.currentRow());
+
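+        // Running fraction of valid (attributed) clicks per (ip, app):
+        // cumulative sum of is_attributed divided by cumulative click count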
+        Dataset<Row> newDF = dataset.withColumn("cum_count_click", count("utc_click_time").over(w));
+        newDF = newDF.withColumn("cum_sum_attributed", sum("is_attributed").over(w));
+        newDF = newDF.withColumn("avg_valid_click_count", col("cum_sum_attributed").divide(col("cum_count_click")));
+        newDF = newDF.drop("cum_count_click", "cum_sum_attributed");
+        return newDF;
+    }
+
+    private Dataset<Row> clickTimeDelta(Dataset<Row> dataset) {
+        WindowSpec w = Window.partitionBy("ip")
+                .orderBy("utc_click_time");
+
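+        // lag() pulls the previous click time of the same ip; the first
+        // click in a partition has no predecessor (lag is null), so its
+        // click_time_delta is defined as 0 instead of null.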
+        Dataset<Row> newDF = dataset.withColumn("lag(utc_click_time)", lag("utc_click_time", 1).over(w));
+        newDF = newDF.withColumn("click_time_delta",
+                when(col("lag(utc_click_time)").isNull(), lit(0)).otherwise(col("utc_click_time"))
+                        .minus(when(col("lag(utc_click_time)").isNull(), lit(0)).otherwise(col("lag(utc_click_time)"))));
+        newDF = newDF.drop("lag(utc_click_time)");
+        return newDF;
+    }
+
+    private Dataset<Row> countClickInTenMinutes(Dataset<Row> dataset) {
+        WindowSpec w = Window.partitionBy("ip")
+                .orderBy("utc_click_time")
+                .rangeBetween(Window.currentRow(), Window.currentRow() + 600);
+
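+        // utc_click_time is in epoch seconds, so the range frame
+        // [currentRow, currentRow + 600] covers the next ten minutes of
+        // clicks from the same ip; minus(1) excludes the current click.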
+        Dataset<Row> newDF = dataset.withColumn("count_click_in_ten_mins",
+                (count("utc_click_time").over(w)).minus(1)); // TODO: decide whether the current click itself should be included in the count.
+        return newDF;
+    }
+}
\ No newline at end of file
src/main/java/GUI.java
0 → 100644
+import javax.swing.*;
+import javax.swing.table.DefaultTableModel;
+import java.awt.*;
+import java.io.BufferedReader;
+import java.io.StringReader;
+import java.util.List;
+
+public class GUI extends JFrame {
+    JTabbedPane tab = new JTabbedPane();
+
+    public GUI(List<String> q, String[][] data) {
+        super("CESCO");
+
+        tab.addTab("png", new PngPane());
+        tab.addTab("gif", new GifPane());
+        tab.addTab("jpg", new JpgPane());
+        tab.addTab("table", new createTable(q));
+        tab.addTab("processed_features", new createTable_alter(data));
+
+        add(tab);
+
+        setSize(800, 500); // window size: width x height
+        setVisible(true); // true shows the window, false hides it
+        setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); // exit when the close button is pressed
+    }
+
+//    public static void main(String args[]) {
+//        new GUI();
+//    }
+}
+
+class PngPane extends JPanel {
+    public PngPane() {
+        super();
+        ImageIcon image = new ImageIcon("data/model.png");
+        JLabel label = new JLabel("", image, JLabel.CENTER);
+        setLayout(new BorderLayout());
+        add(label, BorderLayout.CENTER);
+    }
+}
+
+class GifPane extends JPanel {
+    public GifPane() {
+        super();
+        ImageIcon image = new ImageIcon("data/model.gif");
+        JLabel label = new JLabel("", image, JLabel.CENTER);
+        setLayout(new BorderLayout());
+        add(label, BorderLayout.CENTER);
+    }
+}
+
+class JpgPane extends JPanel {
+    public JpgPane() {
+        super();
+        ImageIcon image = new ImageIcon("data/model.jpg");
+        JLabel label = new JLabel("", image, JLabel.CENTER);
+        setLayout(new BorderLayout());
+        add(label, BorderLayout.CENTER);
+    }
+}
+
+class createTable_alter extends JPanel {
+    // Column labels in the order of the processed dataset schema below
+    private String[] header = {"ip", "app", "device", "os", "channel", "is_attributed",
+            "utc_click_time", "utc_attributed_time",
+            "avg_valid_click_count", "click_time_delta", "count_click_in_ten_mins"};
+/*
+root
+ |-- ip: integer (nullable = true)
+ |-- app: integer (nullable = true)
+ |-- device: integer (nullable = true)
+ |-- os: integer (nullable = true)
+ |-- channel: integer (nullable = true)
+ |-- is_attributed: integer (nullable = true)
+ |-- utc_click_time: long (nullable = true)
+ |-- utc_attributed_time: long (nullable = true)
+ |-- avg_valid_click_count: double (nullable = true)
+ |-- click_time_delta: long (nullable = true)
+ |-- count_click_in_ten_mins: long (nullable = false)
+ */
+    public createTable_alter(String[][] data) {
+        JTable processed_table = new JTable(data, header);
+        JScrollPane jScrollPane = new JScrollPane(processed_table);
+        add(jScrollPane);
+    }
+}
+
+class createTable extends JPanel {
+    long start = System.currentTimeMillis();
+
+    public createTable(List<String> data) { // constructor: build and display the table
+        getTableModel(data);
+    }
+
+    private DefaultTableModel getTableModel(List<String> data) {
+        String[] column_n = {"ip", "app", "device", "os", "channel", "is_attributed",
+                "utc_click_time", "utc_attributed_time",
+                "avg_valid_click_count", "click_time_delta", "count_click_in_ten_mins"};
+        Object[][] tabledata = {};
+        DefaultTableModel model = new DefaultTableModel(tabledata, column_n);
+        JTable jtable = new JTable(model);
+        JScrollPane jScrollPane = new JScrollPane(jtable);
+        add(jScrollPane);
+        try {
+            for (int i = 0; i < data.size(); i++) {
+                BufferedReader reader = getFileReader(data.get(i));
+                String line = reader.readLine();
+
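+                // Flatten one JSON record, e.g. {"ip":87540,"app":12,...},
+                // into bare comma-separated values by stripping quotes,
+                // braces, and "key:" prefixes. Caveat: toJSON() omits null
+                // fields, so rows with a null utc_attributed_time produce
+                // fewer values and their cells shift left in the table.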
+                line = line.replace("\"", "");
+                line = line.replaceAll("\\{|\\}", "");
+                line = line.replaceAll("\\w+:", "");
+
+                Object[] temp = line.split(",");
+                model.addRow(temp);
+
+                reader.close();
+            }
+        } catch (Exception e) {
+            System.out.println(e);
+        }
+        long end = System.currentTimeMillis();
+        System.out.println("Steve's Procedure2 time elapsed : " + (end - start) / 1000.0);
+
+        return model;
+    }
+
+    private BufferedReader getFileReader(String data) {
+        // Wrap the in-memory JSON string in a reader;
+        // in a real application this could read from a file instead.
+        return new BufferedReader(new StringReader(data));
+    }
+}
\ No newline at end of file
src/main/java/calForwardTimeDelta.java
100644 → 0
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SparkSession;
-import org.apache.spark.sql.expressions.Window;
-import org.apache.spark.sql.expressions.WindowSpec;
-
-import javax.xml.crypto.Data;
-
-import static org.apache.spark.sql.functions.*;
-
-public class calForwardTimeDelta {
-    static SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("Cesco");
-    static JavaSparkContext sc = new JavaSparkContext(conf);
-
-    public static void main(String[] args) throws Exception{
-        //Create Session
-        SparkSession spark = SparkSession
-                .builder()
-                .appName("Detecting Fraud Clicks")
-                .getOrCreate();
-
-        //run methods here
-        calcDelta(spark);
-    }
-
-    private static void calcDelta(SparkSession spark){
-        // put the path the file you gonna deal with being placed
-        String filepath = "/home/chris/.kaggle/competitions/talkingdata-adtracking-fraud-detection/mnt/ssd/kaggle-talkingdata2/competition_files/train_sample.csv";
-
-        // create Dataset from files
-        Dataset<Row> logDF = spark.read()
-                .format("csv")
-                .option("inferSchema", "true")
-                .option("header","true")
-                .load(filepath);
-
-        // cast timestamp(click_time, attributed_time) type to long type
-
-        //add column for long(click_time)
-        Dataset<Row> newDF = logDF.withColumn("utc_click_time", logDF.col("click_time").cast("long"));
-        //add column for long(attributed_time)
-        newDF = newDF.withColumn("utc_attributed_time", logDF.col("attributed_time").cast("long"));
-        //drop timestamp type columns
-        newDF = newDF.drop("click_time").drop("attributed_time");
-        newDF.createOrReplaceTempView("logs");
-
-        WindowSpec w = Window.partitionBy ("ip")
-                .orderBy("utc_click_time");
-
-        newDF = newDF.withColumn("lag(utc_click_time)", lag("utc_click_time",1).over(w));
-        newDF.where("ip=10").show();
-        newDF = newDF.withColumn("delta", when(col("lag(utc_click_time)").isNull(),lit(0)).otherwise(col("utc_click_time")).minus(when(col("lag(utc_click_time)").isNull(),lit(0)).otherwise(col("lag(utc_click_time)"))));
-        //newDF = newDF.withColumn("delta", datediff());
-        newDF = newDF.drop("lag(utc_click_time)");
-        newDF = newDF.orderBy("ip");
-
-        newDF.show();
-    }
-
-}