tnt-ooo-tnt

Java-Cesco/Detecting-fraud-clicks#13-Draft

1 -import org.apache.spark.api.java.function.MapFunction;
2 -import org.apache.spark.sql.Dataset;
3 -import org.apache.spark.sql.Encoders;
4 -import org.apache.spark.sql.Row;
5 -import org.apache.spark.sql.SparkSession;
6 -import org.apache.spark.sql.expressions.Window;
7 -import org.apache.spark.sql.expressions.WindowSpec;
8 -
9 -import java.util.ArrayList;
10 -import java.util.List;
11 -
12 -import static org.apache.spark.sql.functions.*;
13 -import static org.apache.spark.sql.functions.lit;
14 -import static org.apache.spark.sql.functions.when;
15 -
16 -public class Aggregation {
17 -
18 - public static void main(String[] args) throws Exception {
19 -
20 - //Create Session
21 - SparkSession spark = SparkSession
22 - .builder()
23 - .appName("Detecting Fraud Clicks")
24 - .master("local")
25 - .getOrCreate();
26 -
27 - // Aggregation
28 - Aggregation agg = new Aggregation();
29 -
30 - Dataset<Row> dataset = agg.loadCSVDataSet("/home/chris/.kaggle/competitions/talkingdata-adtracking-fraud-detection/mnt/ssd/kaggle-talkingdata2/competition_files/train_sample.csv", spark);
31 - dataset = agg.changeTimestempToLong(dataset);
32 - dataset = agg.averageValidClickCount(dataset);
33 - dataset = agg.clickTimeDelta(dataset);
34 - dataset = agg.countClickInTenMinutes(dataset);
35 -
36 - long start = System.currentTimeMillis();
37 -
38 - List<String> logs_with_features = dataset.map(row->row.toString(), Encoders.STRING()).collectAsList();
39 - String[][] contents = new String[(int)dataset.count()][11];
40 - for (int i =0; i<logs_with_features.size();i++){
41 - String str_to_split = logs_with_features.get(i);
42 - String[] tmp = str_to_split.substring(1,str_to_split.length()-1).split(",");
43 - contents[i] = tmp;
44 - }
45 -
46 - long end = System.currentTimeMillis();
47 - System.out.println("JK's Procedure time elapsed : " + (end-start)/1000.0);
48 -
49 - start = System.currentTimeMillis();
50 - List<String> stringDataset = dataset.toJSON().collectAsList();
51 - end = System.currentTimeMillis();
52 - System.out.println("Steve's Procedure 1 time elapsed : " + (end-start)/1000.0);
53 - new GUI(stringDataset, contents);
54 -
55 -
56 - }
57 -
58 -
59 - private Dataset<Row> loadCSVDataSet(String path, SparkSession spark){
60 - // Read SCV to DataSet
61 - return spark.read().format("csv")
62 - .option("inferSchema", "true")
63 - .option("header", "true")
64 - .load(path);
65 - }
66 -
67 - private Dataset<Row> changeTimestempToLong(Dataset<Row> dataset){
68 - // cast timestamp to long
69 - Dataset<Row> newDF = dataset.withColumn("utc_click_time", dataset.col("click_time").cast("long"));
70 - newDF = newDF.withColumn("utc_attributed_time", dataset.col("attributed_time").cast("long"));
71 - newDF = newDF.drop("click_time").drop("attributed_time");
72 - return newDF;
73 - }
74 -
75 - private Dataset<Row> averageValidClickCount(Dataset<Row> dataset){
76 - // set Window partition by 'ip' and 'app' order by 'utc_click_time' select rows between 1st row to current row
77 - WindowSpec w = Window.partitionBy("ip", "app")
78 - .orderBy("utc_click_time")
79 - .rowsBetween(Window.unboundedPreceding(), Window.currentRow());
80 -
81 - // aggregation
82 - Dataset<Row> newDF = dataset.withColumn("cum_count_click", count("utc_click_time").over(w));
83 - newDF = newDF.withColumn("cum_sum_attributed", sum("is_attributed").over(w));
84 - newDF = newDF.withColumn("avg_valid_click_count", col("cum_sum_attributed").divide(col("cum_count_click")));
85 - newDF = newDF.drop("cum_count_click", "cum_sum_attributed");
86 - return newDF;
87 - }
88 -
89 - private Dataset<Row> clickTimeDelta(Dataset<Row> dataset){
90 - WindowSpec w = Window.partitionBy ("ip")
91 - .orderBy("utc_click_time");
92 -
93 - Dataset<Row> newDF = dataset.withColumn("lag(utc_click_time)", lag("utc_click_time",1).over(w));
94 - newDF = newDF.withColumn("click_time_delta", when(col("lag(utc_click_time)").isNull(),
95 - lit(0)).otherwise(col("utc_click_time")).minus(when(col("lag(utc_click_time)").isNull(),
96 - lit(0)).otherwise(col("lag(utc_click_time)"))));
97 - newDF = newDF.drop("lag(utc_click_time)");
98 - return newDF;
99 - }
100 -
101 - private Dataset<Row> countClickInTenMinutes(Dataset<Row> dataset){
102 - WindowSpec w = Window.partitionBy("ip")
103 - .orderBy("utc_click_time")
104 - .rangeBetween(Window.currentRow(),Window.currentRow()+600);
105 -
106 - Dataset<Row> newDF = dataset.withColumn("count_click_in_ten_mins",
107 - (count("utc_click_time").over(w)).minus(1)); //TODO 본인것 포함할 것인지 정해야함.
108 - return newDF;
109 - }
110 -}
...\ No newline at end of file ...\ No newline at end of file
...@@ -4,22 +4,15 @@ import org.apache.spark.sql.Row; ...@@ -4,22 +4,15 @@ import org.apache.spark.sql.Row;
4 import javax.swing.*; 4 import javax.swing.*;
5 import java.awt.*; 5 import java.awt.*;
6 import java.io.BufferedReader; 6 import java.io.BufferedReader;
7 +import java.io.File;
7 import java.io.StringReader; 8 import java.io.StringReader;
8 -import java.sql.ResultSet;
9 -import java.sql.ResultSetMetaData;
10 -import java.sql.Statement;
11 import java.util.List; 9 import java.util.List;
12 -import java.util.Vector; 10 +
13 import java.awt.BorderLayout; 11 import java.awt.BorderLayout;
14 import java.awt.GridLayout; 12 import java.awt.GridLayout;
15 import java.awt.event.ActionEvent; 13 import java.awt.event.ActionEvent;
16 import java.awt.event.ActionListener; 14 import java.awt.event.ActionListener;
17 -import java.sql.Connection; 15 +
18 -import java.sql.DriverManager;
19 -import java.sql.ResultSet;
20 -import java.sql.ResultSetMetaData;
21 -import java.sql.Statement;
22 -import java.util.Vector;
23 16
24 import javax.swing.JButton; 17 import javax.swing.JButton;
25 import javax.swing.JFrame; 18 import javax.swing.JFrame;
...@@ -28,31 +21,35 @@ import javax.swing.JPanel; ...@@ -28,31 +21,35 @@ import javax.swing.JPanel;
28 import javax.swing.JScrollPane; 21 import javax.swing.JScrollPane;
29 import javax.swing.JTable; 22 import javax.swing.JTable;
30 import javax.swing.JTextField; 23 import javax.swing.JTextField;
31 -import javax.swing.table.AbstractTableModel; 24 +import javax.swing.filechooser.FileFilter;
32 import javax.swing.table.DefaultTableModel; 25 import javax.swing.table.DefaultTableModel;
33 26
27 +import org.apache.spark.sql.SparkSession;
28 +import org.apache.spark.sql.expressions.Window;
29 +import org.apache.spark.sql.expressions.WindowSpec;
30 +
31 +import static org.apache.spark.sql.functions.*;
32 +import static org.apache.spark.sql.functions.lit;
33 +import static org.apache.spark.sql.functions.when;
34 +
34 public class GUI extends JFrame { 35 public class GUI extends JFrame {
35 JTabbedPane tab = new JTabbedPane(); 36 JTabbedPane tab = new JTabbedPane();
36 37
37 - public GUI(List<String> q, String[][] data) { 38 + public GUI() {
38 super("CESCO"); 39 super("CESCO");
39 - 40 + tab.addTab("main", new CreateTable_tab());
40 - tab.addTab("png", new PngPane()); 41 + tab.addTab("graphics", new PngPane());
41 - tab.addTab("gif", new GifPane());
42 - tab.addTab("jpg", new JpgPane());
43 - tab.addTab("table", new createTable(q));
44 - tab.addTab("processed_features", new createTable_alter(data));
45 -
46 add(tab); 42 add(tab);
47 43
48 - setSize(800, 500); // 윈도우의 크기 가로x세로 44 + setSize(1280, 1024); // 윈도우의 크기 가로x세로
49 setVisible(true); // 창을 보여줄떄 true, 숨길때 false 45 setVisible(true); // 창을 보여줄떄 true, 숨길때 false
50 setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); // x 버튼을 눌렀을때 종료 46 setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); // x 버튼을 눌렀을때 종료
51 } 47 }
52 48
53 -// public static void main(String args[]) { 49 + public static void main(String args[]) {
54 -// new GUI(); 50 +
55 -// } 51 + new GUI();
52 + }
56 } 53 }
57 54
58 class PngPane extends JPanel { 55 class PngPane extends JPanel {
...@@ -65,61 +62,147 @@ class PngPane extends JPanel { ...@@ -65,61 +62,147 @@ class PngPane extends JPanel {
65 } 62 }
66 } 63 }
67 64
68 -class GifPane extends JPanel { 65 +class CreateTable_tab extends JPanel{
69 - public GifPane() { 66 + public JPanel centre_pane = new JPanel();
67 + public JPanel south_pane = new JPanel();
68 +
69 + public JScrollPane pan1 = new JScrollPane();
70 + public JTable table1 = new JTable();
71 + public JButton btn1 = new JButton("CONFIRM");
72 +
73 + public JScrollPane pan2 = new JScrollPane();
74 + public JTable table2 = new JTable();
75 +
76 +
77 + public JScrollPane pan3 = new JScrollPane();
78 + public JTable table3 = new JTable();
79 +
80 + private DefaultTableModel tableModel1 = new DefaultTableModel(new Object[]{"unknown"},1);
81 + private DefaultTableModel tableModel2 = new DefaultTableModel(new Object[]{"unknown"},1);
82 + private DefaultTableModel tableModel3 = new DefaultTableModel(new Object[]{"unknown"},1);
83 +
84 + public CsvFile_chooser temp = new CsvFile_chooser();
85 +
86 +
87 + public CreateTable_tab(){
70 super(); 88 super();
71 - ImageIcon image = new ImageIcon("data/model.gif");
72 - JLabel label = new JLabel("", image, JLabel.CENTER);
73 setLayout(new BorderLayout()); 89 setLayout(new BorderLayout());
74 - add(label, BorderLayout.CENTER); 90 +
91 + //csvFile_chooser
92 + add(temp, BorderLayout.NORTH);
93 +
94 + // sub Panel 1
95 + centre_pane.setLayout(new GridLayout(1, 3));
96 + pan1.setViewportView(table1);
97 + centre_pane.add(pan1);
98 +
99 + // sub Panel 2
100 + pan2.setViewportView(table2);
101 + centre_pane.add(pan2);
102 +
103 + // sub Panel 3
104 + pan3.setViewportView(table3);
105 + centre_pane.add(pan3);
106 +
107 + //sub Panel 4
108 + south_pane.setLayout(new FlowLayout());
109 + south_pane.add(btn1);
110 + btn1.addActionListener(new ActionListener() {
111 + @Override
112 + public void actionPerformed(ActionEvent e) {
113 + if(temp.is_selected) {
114 + String path = temp.selected_file.getAbsolutePath();
115 + // 1st Column Raw Data
116 + SparkSession spark = SparkSession
117 + .builder()
118 + .appName("Detecting Fraud Clicks")
119 + .master("local")
120 + .getOrCreate();
121 +
122 + // Aggregation
123 + Aggregation agg = new Aggregation();
124 +
125 + // Raw data
126 + TableCreator table_maker = new TableCreator();
127 +
128 + Dataset<Row> dataset = agg.loadCSVDataSet(path, spark);
129 + List<String> stringDataset_Raw = dataset.toJSON().collectAsList();
130 + String[] header_r = {"ip", "app", "device", "os", "channel", "click_time", "is_attributed"};
131 + table1.setModel(table_maker.getTableModel(stringDataset_Raw, header_r));
132 +
133 + // 2nd Column Data with features
134 + // Adding features
135 + dataset = agg.changeTimestempToLong(dataset);
136 + dataset = agg.averageValidClickCount(dataset);
137 + dataset = agg.clickTimeDelta(dataset);
138 + dataset = agg.countClickInTenMinutes(dataset);
139 + List<String> stringDataset_feat = dataset.toJSON().collectAsList();
140 + String[] header_f = {"ip", "app", "device", "os", "channel", "is_attributed", "click_time",
141 + "avg_valid_click_count", "click_time_delta", "count_click_in_ten_mins"};
142 + table2.setModel(table_maker.getTableModel(stringDataset_feat, header_f));
143 +
144 +
145 + // 3nd Column Final results
146 +
147 +
75 } 148 }
76 -} 149 + }
150 + });
151 + add(centre_pane, BorderLayout.CENTER);
152 + add(south_pane, BorderLayout.SOUTH);
153 +
77 154
78 -class JpgPane extends JPanel {
79 - public JpgPane() {
80 - super();
81 - ImageIcon image = new ImageIcon("data/model.jpg");
82 - JLabel label = new JLabel("", image, JLabel.CENTER);
83 - setLayout(new BorderLayout());
84 - add(label, BorderLayout.CENTER);
85 } 155 }
86 } 156 }
87 157
/**
 * Small panel with a path text field and a "..." browse button. Choosing a
 * file through the dialog stores it in {@code selected_file}, shows its
 * absolute path in the text field and sets {@code is_selected}.
 */
class CsvFile_chooser extends JPanel{
    private JFileChooser chooser = new JFileChooser();
    private JTextField path_field = new JTextField(30);
    private JButton browser = new JButton("...");
    public File selected_file;
    boolean is_selected = false;

    /** Builds the panel and wires the browse button to the file dialog. */
    public CsvFile_chooser(){
        setLayout(new FlowLayout());

        // Offer a filter that shows directories and *.csv files (case-insensitive).
        FileFilter csvOnly = new FileFilter() {
            @Override
            public boolean accept(File candidate) {
                return candidate.isDirectory()
                        || candidate.getName().toLowerCase().endsWith(".csv");
            }

            @Override
            public String getDescription() {
                return "CSV files (*.csv)";
            }
        };
        chooser.addChoosableFileFilter(csvOnly);

        add(path_field);
        add(browser);

        ActionListener onBrowse = new ActionListener(){
            @Override
            public void actionPerformed(ActionEvent event) {
                // Only react to the browse button itself.
                if ((JButton) event.getSource() != browser) {
                    return;
                }
                int outcome = chooser.showOpenDialog(null);
                if (outcome != JFileChooser.APPROVE_OPTION) {
                    return;
                }
                selected_file = chooser.getSelectedFile();
                path_field.setText(selected_file.getAbsolutePath());
                is_selected = true;
            }
        };
        browser.addActionListener(onBrowse);
    }
}
117 199
118 - private DefaultTableModel getTableModel(List<String> data) { 200 +class TableCreator extends JPanel {
119 - String column_n[]={"ip","app","device","os","channel","is_attributed","click_time", 201 + public DefaultTableModel model;
120 - "avg_valid_click_count","click_time_delta","count_click_in_tenmin"}; 202 +
203 + public DefaultTableModel getTableModel(List<String> data, String[] header) {
121 Object tabledata[][]={}; 204 Object tabledata[][]={};
122 - DefaultTableModel model = new DefaultTableModel(tabledata,column_n); 205 + DefaultTableModel model = new DefaultTableModel(tabledata,header);
123 JTable jtable = new JTable(model); 206 JTable jtable = new JTable(model);
124 JScrollPane jScollPane = new JScrollPane(jtable); 207 JScrollPane jScollPane = new JScrollPane(jtable);
125 add(jScollPane); 208 add(jScollPane);
...@@ -131,11 +214,9 @@ class createTable extends JPanel { ...@@ -131,11 +214,9 @@ class createTable extends JPanel {
131 214
132 line = line.replace("\"", ""); 215 line = line.replace("\"", "");
133 line = line.replace("_", ""); 216 line = line.replace("_", "");
134 - //line = line.replace("\\{","");
135 line = line.replaceAll("\\{|\\}",""); 217 line = line.replaceAll("\\{|\\}","");
136 line = line.replaceAll("\\w+:", ""); 218 line = line.replaceAll("\\w+:", "");
137 219
138 - //System.out.println(line);
139 Object [] temp= line.split(","); 220 Object [] temp= line.split(",");
140 221
141 model.addRow(temp); 222 model.addRow(temp);
...@@ -146,8 +227,6 @@ class createTable extends JPanel { ...@@ -146,8 +227,6 @@ class createTable extends JPanel {
146 } catch (Exception e) { 227 } catch (Exception e) {
147 System.out.println(e); 228 System.out.println(e);
148 } 229 }
149 - long end = System.currentTimeMillis();
150 - System.out.println("Steve's Procedure2 time elapsed : " + (end-start)/1000.0);
151 230
152 return model; 231 return model;
153 } 232 }
...@@ -163,3 +242,59 @@ class createTable extends JPanel { ...@@ -163,3 +242,59 @@ class createTable extends JPanel {
163 return reader; 242 return reader;
164 } 243 }
165 } 244 }
245 +
246 +class Aggregation {
247 +
248 + public Dataset<Row> loadCSVDataSet(String path, SparkSession spark){
249 + // Read SCV to DataSet
250 + return spark.read().format("csv")
251 + .option("inferSchema", "true")
252 + .option("header", "true")
253 + .load(path);
254 +
255 + }
256 +
257 + public Dataset<Row> changeTimestempToLong(Dataset<Row> dataset){
258 + // cast timestamp to long
259 + Dataset<Row> newDF = dataset.withColumn("utc_click_time", dataset.col("click_time").cast("long"));
260 + newDF = newDF.withColumn("utc_attributed_time", dataset.col("attributed_time").cast("long"));
261 + newDF = newDF.drop("click_time").drop("attributed_time");
262 + return newDF;
263 + }
264 +
265 + public Dataset<Row> averageValidClickCount(Dataset<Row> dataset){
266 + // set Window partition by 'ip' and 'app' order by 'utc_click_time' select rows between 1st row to current row
267 + WindowSpec w = Window.partitionBy("ip", "app")
268 + .orderBy("utc_click_time")
269 + .rowsBetween(Window.unboundedPreceding(), Window.currentRow());
270 +
271 + // aggregation
272 + Dataset<Row> newDF = dataset.withColumn("cum_count_click", count("utc_click_time").over(w));
273 + newDF = newDF.withColumn("cum_sum_attributed", sum("is_attributed").over(w));
274 + newDF = newDF.withColumn("avg_valid_click_count", col("cum_sum_attributed").divide(col("cum_count_click")));
275 + newDF = newDF.drop("cum_count_click", "cum_sum_attributed");
276 + return newDF;
277 + }
278 +
279 + public Dataset<Row> clickTimeDelta(Dataset<Row> dataset){
280 + WindowSpec w = Window.partitionBy ("ip")
281 + .orderBy("utc_click_time");
282 +
283 + Dataset<Row> newDF = dataset.withColumn("lag(utc_click_time)", lag("utc_click_time",1).over(w));
284 + newDF = newDF.withColumn("click_time_delta", when(col("lag(utc_click_time)").isNull(),
285 + lit(0)).otherwise(col("utc_click_time")).minus(when(col("lag(utc_click_time)").isNull(),
286 + lit(0)).otherwise(col("lag(utc_click_time)"))));
287 + newDF = newDF.drop("lag(utc_click_time)");
288 + return newDF;
289 + }
290 +
291 + public Dataset<Row> countClickInTenMinutes(Dataset<Row> dataset){
292 + WindowSpec w = Window.partitionBy("ip")
293 + .orderBy("utc_click_time")
294 + .rangeBetween(Window.currentRow(),Window.currentRow()+600);
295 +
296 + Dataset<Row> newDF = dataset.withColumn("count_click_in_ten_mins",
297 + (count("utc_click_time").over(w)).minus(1)); //TODO 본인것 포함할 것인지 정해야함.
298 + return newDF;
299 + }
300 +}
...\ No newline at end of file ...\ No newline at end of file
......