
Java-Cesco/Detecting_fraud_clicks#13-1

+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.expressions.Window;
+import org.apache.spark.sql.expressions.WindowSpec;
+
+import java.util.List;
+
+import static org.apache.spark.sql.functions.*;
+
+public class Aggregation {
+
+    public static void main(String[] args) throws Exception {
+
+        // Create Session
+        SparkSession spark = SparkSession
+                .builder()
+                .appName("Detecting Fraud Clicks")
+                .master("local")
+                .getOrCreate();
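+        // note: master("local") runs Spark on a single worker thread;
+        // the removed calForwardTimeDelta used "local[*]", i.e. all available cores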
+
+        // Aggregation
+        Aggregation agg = new Aggregation();
+
+        Dataset<Row> dataset = agg.loadCSVDataSet("/home/chris/.kaggle/competitions/talkingdata-adtracking-fraud-detection/mnt/ssd/kaggle-talkingdata2/competition_files/train_sample.csv", spark);
+        dataset = agg.changeTimestampToLong(dataset);
+        dataset = agg.averageValidClickCount(dataset);
+        dataset = agg.clickTimeDelta(dataset);
+        dataset = agg.countClickInTenMinutes(dataset);
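+        // the dataset now carries three engineered features:
+        // avg_valid_click_count, click_time_delta and count_click_in_ten_mins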
+
+        long start = System.currentTimeMillis();
+
+        // Dataset.map is overloaded in the Java API; the explicit MapFunction cast
+        // keeps the lambda unambiguous for the Java compiler
+        List<String> logs_with_features = dataset.map((MapFunction<Row, String>) row -> row.toString(), Encoders.STRING()).collectAsList();
+        String[][] contents = new String[(int) dataset.count()][11];
+        for (int i = 0; i < logs_with_features.size(); i++) {
+            String str_to_split = logs_with_features.get(i);
+            String[] tmp = str_to_split.substring(1, str_to_split.length() - 1).split(",");
+            contents[i] = tmp;
+        }
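+        // note: this parses Row.toString() output, which breaks if any field ever
+        // contains a comma; dataset.count() above also triggers a separate Spark job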
+
+        long end = System.currentTimeMillis();
+        System.out.println("JK's Procedure time elapsed : " + (end - start) / 1000.0);
+
+        start = System.currentTimeMillis();
+        List<String> stringDataset = dataset.toJSON().collectAsList();
+        end = System.currentTimeMillis();
+        System.out.println("Steve's Procedure 1 time elapsed : " + (end - start) / 1000.0);
+        new GUI(stringDataset, contents);
+    }
+
+    private Dataset<Row> loadCSVDataSet(String path, SparkSession spark) {
+        // read CSV into a Dataset
+        return spark.read().format("csv")
+                .option("inferSchema", "true")
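+                // note: inferSchema triggers an extra pass over the file to infer column types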
+                .option("header", "true")
+                .load(path);
+    }
+
+    private Dataset<Row> changeTimestampToLong(Dataset<Row> dataset) {
+        // cast the timestamp columns to long
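+        // casting a timestamp to long yields Unix epoch seconds
+        // (e.g. 2017-11-07 09:30:38 UTC -> 1510047038)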
+        Dataset<Row> newDF = dataset.withColumn("utc_click_time", dataset.col("click_time").cast("long"));
+        newDF = newDF.withColumn("utc_attributed_time", dataset.col("attributed_time").cast("long"));
+        newDF = newDF.drop("click_time").drop("attributed_time");
+        return newDF;
+    }
+
+    private Dataset<Row> averageValidClickCount(Dataset<Row> dataset) {
+        // window partitioned by (ip, app), ordered by utc_click_time,
+        // framed from the first row up to the current row
+        WindowSpec w = Window.partitionBy("ip", "app")
+                .orderBy("utc_click_time")
+                .rowsBetween(Window.unboundedPreceding(), Window.currentRow());
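+        // the frame grows one row at a time, so every aggregate over it is a running total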
+
+        // aggregation
+        Dataset<Row> newDF = dataset.withColumn("cum_count_click", count("utc_click_time").over(w));
+        newDF = newDF.withColumn("cum_sum_attributed", sum("is_attributed").over(w));
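+        // running conversion rate: cumulative attributed clicks / cumulative click count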
+        newDF = newDF.withColumn("avg_valid_click_count", col("cum_sum_attributed").divide(col("cum_count_click")));
+        newDF = newDF.drop("cum_count_click", "cum_sum_attributed");
+        return newDF;
+    }
+
+    private Dataset<Row> clickTimeDelta(Dataset<Row> dataset) {
+        WindowSpec w = Window.partitionBy("ip")
+                .orderBy("utc_click_time");
+
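+        // lag(utc_click_time, 1) is the previous click's epoch time within the same ip
+        // partition; it is null for an ip's first click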
+        Dataset<Row> newDF = dataset.withColumn("lag(utc_click_time)", lag("utc_click_time", 1).over(w));
+        newDF = newDF.withColumn("click_time_delta",
+                when(col("lag(utc_click_time)").isNull(), lit(0))
+                        .otherwise(col("utc_click_time").minus(col("lag(utc_click_time)"))));
+        newDF = newDF.drop("lag(utc_click_time)");
+        return newDF;
+    }
+
+    private Dataset<Row> countClickInTenMinutes(Dataset<Row> dataset) {
+        WindowSpec w = Window.partitionBy("ip")
+                .orderBy("utc_click_time")
+                .rangeBetween(Window.currentRow(), Window.currentRow() + 600);
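+        // rangeBetween is measured in the orderBy column's units; utc_click_time is in
+        // epoch seconds, so the frame [0, 600] covers this click plus the next ten minutes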
+
+        Dataset<Row> newDF = dataset.withColumn("count_click_in_ten_mins",
+                (count("utc_click_time").over(w)).minus(1)); // TODO: decide whether the count should include the current click itself
+        return newDF;
+    }
+}
\ No newline at end of file
+import javax.swing.*;
+import javax.swing.table.DefaultTableModel;
+import java.awt.BorderLayout;
+import java.io.BufferedReader;
+import java.io.StringReader;
+import java.util.List;
+
+public class GUI extends JFrame {
+    JTabbedPane tab = new JTabbedPane();
+
+    public GUI(List<String> q, String[][] data) {
+        super("CESCO");
+
+        tab.addTab("png", new PngPane());
+        tab.addTab("gif", new GifPane());
+        tab.addTab("jpg", new JpgPane());
+        tab.addTab("table", new createTable(q));
+        tab.addTab("processed_features", new createTable_alter(data));
+
+        add(tab);
+
+        setSize(800, 500); // window size: width x height
+        setVisible(true); // true shows the window, false hides it
+        setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); // exit when the X button is clicked
+    }
+
+//    public static void main(String args[]) {
+//        new GUI();
+//    }
+}
+
+class PngPane extends JPanel {
+    public PngPane() {
+        super();
+        ImageIcon image = new ImageIcon("data/model.png");
+        JLabel label = new JLabel("", image, JLabel.CENTER);
+        setLayout(new BorderLayout());
+        add(label, BorderLayout.CENTER);
+    }
+}
+
+class GifPane extends JPanel {
+    public GifPane() {
+        super();
+        ImageIcon image = new ImageIcon("data/model.gif");
+        JLabel label = new JLabel("", image, JLabel.CENTER);
+        setLayout(new BorderLayout());
+        add(label, BorderLayout.CENTER);
+    }
+}
+
+class JpgPane extends JPanel {
+    public JpgPane() {
+        super();
+        ImageIcon image = new ImageIcon("data/model.jpg");
+        JLabel label = new JLabel("", image, JLabel.CENTER);
+        setLayout(new BorderLayout());
+        add(label, BorderLayout.CENTER);
+    }
+}
+
+class createTable_alter extends JPanel {
+    private String[] header = {"ip", "app", "device", "os", "channel", "is_attributed", "click_time", "attributed_time",
+            "avg_valid_click_count", "click_time_delta", "count_click_in_tenmin"};
+/*
+root
+ |-- ip: integer (nullable = true)
+ |-- app: integer (nullable = true)
+ |-- device: integer (nullable = true)
+ |-- os: integer (nullable = true)
+ |-- channel: integer (nullable = true)
+ |-- is_attributed: integer (nullable = true)
+ |-- utc_click_time: long (nullable = true)
+ |-- utc_attributed_time: long (nullable = true)
+ |-- avg_valid_click_count: double (nullable = true)
+ |-- click_time_delta: long (nullable = true)
+ |-- count_click_in_ten_mins: long (nullable = false)
+ */
+    public createTable_alter(String[][] data) {
+        JTable processed_table = new JTable(data, header);
+        JScrollPane jScrollPane = new JScrollPane(processed_table);
+        add(jScrollPane);
+    }
+}
+
+class createTable extends JPanel {
+    long start = System.currentTimeMillis();
+
+    public createTable(List<String> data) { // constructor: builds and displays the table
+        getTableModel(data);
+    }
+
+    private DefaultTableModel getTableModel(List<String> data) {
+        String[] column_n = {"ip", "app", "device", "os", "channel", "is_attributed", "click_time",
+                "avg_valid_click_count", "click_time_delta", "count_click_in_tenmin"};
+        Object[][] tabledata = {};
+        DefaultTableModel model = new DefaultTableModel(tabledata, column_n);
+        JTable jtable = new JTable(model);
+        JScrollPane jScrollPane = new JScrollPane(jtable);
+        add(jScrollPane);
+        try {
+            for (int i = 0; i < data.size(); i++) {
+                BufferedReader reader = getFileReader(data.get(i));
+                String line = reader.readLine();
+
+                line = line.replace("\"", "");
+                line = line.replace("_", "");
+                line = line.replaceAll("\\{|\\}", "");
+                line = line.replaceAll("\\w+:", "");
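+                // the replaces above strip JSON syntax (quotes, braces, key: prefixes),
+                // leaving a bare comma-separated line; caveat: toJSON() omits null fields,
+                // so rows with a null attributed_time yield fewer values and can misalign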
+
+                Object[] temp = line.split(",");
+                model.addRow(temp);
+                reader.close();
+            }
+        } catch (Exception e) {
+            System.out.println(e);
+        }
+        long end = System.currentTimeMillis();
+        System.out.println("Steve's Procedure2 time elapsed : " + (end - start) / 1000.0);
+
+        return model;
+    }
+
+    private BufferedReader getFileReader(String data) {
+        // each record is already an in-memory JSON string, so wrap it in a StringReader;
+        // in a real application the reader could come from a file instead
+        BufferedReader reader = new BufferedReader(new StringReader(data));
+        return reader;
+    }
+}
\ No newline at end of file
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SparkSession;
-import org.apache.spark.sql.expressions.Window;
-import org.apache.spark.sql.expressions.WindowSpec;
-
-import javax.xml.crypto.Data;
-
-import static org.apache.spark.sql.functions.*;
-
-public class calForwardTimeDelta {
-    static SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("Cesco");
-    static JavaSparkContext sc = new JavaSparkContext(conf);
-
-    public static void main(String[] args) throws Exception {
-        // Create Session
-        SparkSession spark = SparkSession
-                .builder()
-                .appName("Detecting Fraud Clicks")
-                .getOrCreate();
-
-        // run methods here
-        calcDelta(spark);
-    }
-
-    private static void calcDelta(SparkSession spark) {
-        // path of the input file
-        String filepath = "/home/chris/.kaggle/competitions/talkingdata-adtracking-fraud-detection/mnt/ssd/kaggle-talkingdata2/competition_files/train_sample.csv";
-
-        // create a Dataset from the file
-        Dataset<Row> logDF = spark.read()
-                .format("csv")
-                .option("inferSchema", "true")
-                .option("header", "true")
-                .load(filepath);
-
-        // cast the timestamp columns (click_time, attributed_time) to long
-
-        // add column for long(click_time)
-        Dataset<Row> newDF = logDF.withColumn("utc_click_time", logDF.col("click_time").cast("long"));
-        // add column for long(attributed_time)
-        newDF = newDF.withColumn("utc_attributed_time", logDF.col("attributed_time").cast("long"));
-        // drop the original timestamp columns
-        newDF = newDF.drop("click_time").drop("attributed_time");
-        newDF.createOrReplaceTempView("logs");
-
-        WindowSpec w = Window.partitionBy("ip")
-                .orderBy("utc_click_time");
-
-        newDF = newDF.withColumn("lag(utc_click_time)", lag("utc_click_time", 1).over(w));
-        newDF.where("ip=10").show();
-        newDF = newDF.withColumn("delta", when(col("lag(utc_click_time)").isNull(), lit(0)).otherwise(col("utc_click_time")).minus(when(col("lag(utc_click_time)").isNull(), lit(0)).otherwise(col("lag(utc_click_time)"))));
-        newDF = newDF.drop("lag(utc_click_time)");
-        newDF = newDF.orderBy("ip");
-
-        newDF.show();
-    }
-}