최재은

Add: Event Embedding code & data, revise final report

Showing 41 changed files with 984 additions and 0 deletions
1 +import numpy as np
2 +
3 +print(np.load("./resultEmbeding.pickle", allow_pickle=True))
1 +{"class_name": "Sequential", "config": [{"class_name": "Flatten", "config": {"name": "flatten_1", "trainable": true, "batch_input_shape": [null, 5, 80], "dtype": "float32"}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 512, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.8}}, {"class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 1024, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_2", "trainable": true, "rate": 0.8}}, {"class_name": "Dropout", "config": {"name": "dropout_3", "trainable": true, "rate": 0.8}}, {"class_name": "Dense", "config": {"name": "dense_3", "trainable": true, "units": 2, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.0.2", "backend": "theano"}
1 +{"class_name": "Sequential", "config": [{"class_name": "Conv1D", "config": {"name": "conv1d_1", "trainable": true, "batch_input_shape": [null, 5, 80], "dtype": "float32", "filters": 128, "kernel_size": [1], "strides": [1], "padding": "valid", "dilation_rate": [1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Conv1D", "config": {"name": "conv1d_2", "trainable": true, "filters": 128, "kernel_size": [3], "strides": [1], "padding": "same", "dilation_rate": [1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "MaxPooling1D", "config": {"name": "max_pooling1d_1", "trainable": true, "strides": [2], "pool_size": [2], "padding": "valid"}}, {"class_name": "Flatten", "config": {"name": "flatten_1", "trainable": true}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.8}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 2, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.0.2", "backend": "theano"}
1 +{"class_name": "Sequential", "config": [{"class_name": "LSTM", "config": {"name": "lstm_1", "trainable": true, "batch_input_shape": [null, 5, 80], "dtype": "float32", "return_sequences": false, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 256, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": {"class_name": "L1L2", "config": {"l1": 0.0, "l2": 0.0010000000474974513}}, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.6}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 2, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": {"class_name": "L1L2", "config": {"l1": 0.0, "l2": 0.0010000000474974513}}, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.0.2", "backend": "theano"}
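The three JSON files above are model.to_json() dumps of the feed-forward, CNN, and LSTM price-movement classifiers trained by the scripts later in this commit; their batch_input_shape [null, 5, 80] corresponds to five days of 80-dimensional event embeddings. A minimal reloading sketch, assuming the file names produced by those training scripts (model2_price_move_predict.json / .hdf5):

import numpy as np
from keras.models import model_from_json

with open("model2_price_move_predict.json") as f:
    model = model_from_json(f.read())
model.load_weights("model2_price_move_predict.hdf5")
model.compile(loss="categorical_crossentropy", optimizer="adam",
              metrics=["accuracy"])
# X should be shaped (samples, 5, 80), matching batch_input_shape above
probs = model.predict(np.zeros((1, 5, 80), dtype="float32"))
print(probs)  # class probabilities for [up, down]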
1 +#!/usr/bin/env python3
2 +# -*- coding: utf-8 -*-
3 +"""
4 +Created on Mon Mar 20 23:41:51 2017
5 +
6 +@author: red-sky
7 +"""
8 +
9 +
10 +import numpy as np
11 +import theano
12 +from theano import tensor as T
13 +
14 +
15 +class EmbeddingLayer(object):
16 + def __init__(self, num_vocab, word_dim, rng, embedding_w=None):
17 + '''
18 + word_dim :: dimension of the word embeddings
19 + num_vocab :: number of word embeddings in the vocabulary
20 +        embedding_w :: pre-trained word vectors (optional)
21 + '''
22 +
23 +        if embedding_w is None:
24 +            word_vectors = rng.uniform(-1.0, 1.0, (num_vocab, word_dim))
25 +        else:
26 +            word_vectors = embedding_w
27 +        # cast to floatX *before* creating the shared variable; .astype() on
28 +        # the shared variable itself would yield a plain symbolic tensor
29 +        self.embedding_w = theano.shared(
30 +            np.asarray(word_vectors, dtype=theano.config.floatX),
31 +            name="EmbeddingLayer_W")
32 +
33 + self.params = [self.embedding_w]
34 + self.infor = [num_vocab, word_dim]
35 +
36 + def words_ind_2vec(self, index):
37 + map_word_vectors = self.embedding_w[index]
38 + output = T.mean(map_word_vectors, axis=0)
39 + return output, map_word_vectors
40 +
41 +
42 +if __name__ == "__main__":
43 +    rng = np.random.RandomState(220495)
44 +    arrWords = T.ivector("words")
45 +    EMBD = EmbeddingLayer(100, 150, rng=rng)
46 +    # words_ind_2vec returns (mean vector, per-word vectors)
47 +    Word2Vec = theano.function(
48 +        inputs=[arrWords],
49 +        outputs=list(EMBD.words_ind_2vec(arrWords))
50 +    )
51 +    Vec, WordVecs = Word2Vec(np.asarray([1, 2, 3, 4], dtype="int32"))
52 +    print("Dim: ", Vec.shape)
53 +    print("Val: ", Vec)
1 +#!/usr/bin/env python3
2 +# -*- coding: utf-8 -*-
3 +"""
4 +Created on Sat Mar 25 16:13:18 2017
5 +
6 +@author: red-sky
7 +"""
8 +
9 +import theano
10 +import numpy as np
11 +import theano.tensor as T
12 +from SmallUtils import createShareVar
13 +
14 +
15 +class RoleDependentLayer(object):
16 + def __init__(self, left_dependent, right_dependent, rng,
17 + n_in=100, n_out=4, trainedParams=None,
18 + name="RoleDependentEmbedding_"):
19 + if trainedParams is None:
20 + trainedParams = {
21 + name: {
22 + "T": None, "W1": None, "W2": None, "b": None
23 + }
24 + }
25 +
26 + if trainedParams[name]["T"] is not None:
27 + assert trainedParams[name]["T"].shape == (n_out, n_in, n_in)
28 + self.T = theano.shared(value=trainedParams[name]["T"],
29 + name=name+"T", borrow=True)
30 + else:
31 + self.T = createShareVar(rng=rng, name=name+"T",
32 + factor_for_init=n_out + n_in,
33 + dim=(n_out, n_in, n_in))
34 +
35 + if trainedParams[name]["W1"] is not None:
36 + assert trainedParams[name]["W1"].shape == (n_in, n_out)
37 + self.W1 = theano.shared(value=trainedParams[name]["W1"],
38 + name=name+"W1", borrow=True)
39 + else:
40 + self.W1 = createShareVar(rng=rng, name=name+"W1",
41 + factor_for_init=n_out + n_in,
42 + dim=(n_in, n_out))
43 +
44 + if trainedParams[name]["W2"] is not None:
45 + assert trainedParams[name]["W2"].shape == (n_in, n_out)
46 + self.W2 = theano.shared(value=trainedParams[name]["W2"],
47 + name=name+"W2", borrow=True)
48 + else:
49 + self.W2 = createShareVar(rng=rng, name=name+"W2",
50 + factor_for_init=n_out + n_in,
51 + dim=(n_in, n_out))
52 +
53 + if trainedParams[name]["b"] is not None:
54 + assert trainedParams[name]["b"].shape == (n_out,)
55 + self.b = theano.shared(value=trainedParams[name]["b"],
56 + name=name+"b", borrow=True)
57 + else:
58 + b_values = np.zeros(shape=(n_out,), dtype=theano.config.floatX)
59 + self.b = theano.shared(value=b_values, name=name+"b", borrow=True)
60 +
61 + # list of layer params
62 + self.params = [self.T, self.W1, self.W2, self.b]
63 +
64 +        # L2 regularization term
65 + self.L2 = sum([(param**2).sum() for param in self.params])
66 +
67 + # Bi-linear step
68 + def one_kernel(Tk, left, right):
69 +            first_bilinear = theano.dot(left, Tk)
70 +            second_bilinear = theano.dot(first_bilinear, right)
71 +            return second_bilinear.flatten()
72 +
73 + bi_1, _ = theano.scan(
74 + fn=one_kernel,
75 + sequences=[self.T],
76 + non_sequences=[left_dependent, right_dependent],
77 + n_steps=n_out
78 + )
79 +
80 + # Feed forward network step
81 + feedforward_step1 = theano.dot(left_dependent, self.W1)
82 + feedforward_step2 = theano.dot(right_dependent, self.W2)
83 + feedforward_step3 = (feedforward_step1 +
84 + feedforward_step2.dimshuffle("x", 0) +
85 + self.b.dimshuffle("x", 0))
86 + feedforward_step4 = bi_1.dimshuffle(1, 0) + feedforward_step3
87 + self.output = theano.tensor.tanh(feedforward_step4)
88 + self.test = [feedforward_step3]
89 +
90 + def output_(self, left_dependent, right_dependent):
91 +
92 +        def one_kernel(Tk, left, right):
93 +            first_bilinear = theano.dot(left, Tk)
94 +            second_bilinear = theano.dot(first_bilinear, right)
95 +            return second_bilinear.flatten()
96 +
97 +        bi_linear_tensor, _ = theano.scan(
98 +            fn=one_kernel,
99 +            sequences=[self.T],
100 +            non_sequences=[left_dependent, right_dependent],
101 +            # n_out is local to __init__; derive the step count from self.T
102 +            n_steps=self.T.shape[0])
103 +
104 + bi_linear_tensor = bi_linear_tensor.dimshuffle(1, 0)
105 + feedforward_step1 = theano.dot(left_dependent, self.W1)
106 + feedforward_step2 = theano.dot(right_dependent, self.W2)
107 + feedforward_step3 = (feedforward_step1 +
108 + feedforward_step2.dimshuffle("x", 0) +
109 + self.b.dimshuffle("x", 0))
110 + feedforward_step4 = bi_linear_tensor + feedforward_step3
111 + output = theano.tensor.tanh(feedforward_step4)
112 + return(output)
113 +
114 + def get_params(self):
115 + trainedParams = {
116 + "T": self.T.get_value(), "W1": self.W1.get_value(),
117 + "W2": self.W2.get_value(), "b": self.b.get_value()
118 + }
119 + return(trainedParams)
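A minimal wiring sketch for this layer, under the shape assumptions implied by the dimshuffle calls above (left_dependent is a batch of n_in-dimensional vectors, right_dependent a single n_in-dimensional vector); the module name in the import is an assumption, since file names are collapsed in this diff:

import numpy as np
import theano
import theano.tensor as T
from RoleDependentLayer import RoleDependentLayer  # assumed module/file name

rng = np.random.RandomState(220495)
left = T.matrix("left")    # (batch, n_in), e.g. averaged actor word vectors
right = T.vector("right")  # (n_in,), e.g. averaged action word vectors
layer = RoleDependentLayer(left_dependent=left, right_dependent=right,
                           rng=rng, n_in=100, n_out=4)
forward = theano.function(inputs=[left, right], outputs=layer.output)
out = forward(np.random.rand(3, 100).astype(theano.config.floatX),
              np.random.rand(100).astype(theano.config.floatX))
print(out.shape)  # expected: (3, 4)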
1 +#!/usr/bin/env python3
2 +# -*- coding: utf-8 -*-
3 +"""
4 +Created on Sat Mar 25 15:55:14 2017
5 +
6 +@author: red-sky
7 +"""
8 +import theano
9 +import theano.tensor as T
10 +import numpy as np
11 +
12 +def createShareVar(rng, dim, name, factor_for_init):
13 +    var_values = np.asarray(
14 +        rng.uniform(
15 +            low=-np.sqrt(6.0 / factor_for_init),
16 +            high=np.sqrt(6.0 / factor_for_init),
17 +            size=dim),
18 +        # cast to floatX so parameters match the rest of the graph
19 +        dtype=theano.config.floatX)
20 + Var = theano.shared(value=var_values, name=name, borrow=True)
21 + return Var
22 +
23 +
24 +def adadelta(lr, tparams, cost, grads, listInput):
25 + """
26 + An adaptive learning rate optimizer
27 +
28 + Parameters
29 + ----------
30 + lr : Theano SharedVariable
31 + Initial learning rate
32 +    tparams: list of Theano SharedVariables
33 +        Model parameters
34 +    grads: Theano variable
35 +        Gradients of cost w.r.t. parameters
36 +
37 +    cost: Theano variable
38 +        Objective function to minimize
39 +
40 + Notes
41 + -----
42 + For more information, see [ADADELTA]_.
43 +
44 + .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
45 + Rate Method*, arXiv:1212.5701.
46 + """
47 + np_float = np.asarray(0., dtype=theano.config.floatX)
48 + zipped_grads = [theano.shared(p.get_value() * np_float,
49 + name='%s_grad' % k)
50 + for k, p in enumerate(tparams)]
51 + running_up2 = [theano.shared(p.get_value() * np_float,
52 + name='%s_rup2' % k)
53 + for k, p in enumerate(tparams)]
54 + running_grads2 = [theano.shared(p.get_value() * np_float,
55 + name='%s_rgrad2' % k)
56 + for k, p in enumerate(tparams)]
57 +
58 + zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
59 + rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
60 + for rg2, g in zip(running_grads2, grads)]
61 +
62 + f_grad_shared = theano.function(inputs=listInput,
63 + outputs=cost,
64 + updates=zgup + rg2up,
65 + name='adadelta_f_grad_shared')
66 +
67 + updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
68 + for zg, ru2, rg2 in zip(zipped_grads,
69 + running_up2,
70 + running_grads2)]
71 + ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
72 + for ru2, ud in zip(running_up2, updir)]
73 + param_up = [(p, p + ud) for p, ud in zip(tparams, updir)]
74 +
75 + f_update = theano.function([lr], [], updates=ru2up + param_up,
76 + on_unused_input='ignore',
77 + name='adadelta_f_update')
78 +
79 + return f_grad_shared, f_update
80 +
81 +
82 +def ADAM_OPTIMIZER(loss, all_params, learning_rate=0.001,
83 + b1=0.9, b2=0.999, e=1e-8, gamma=1-1e-8):
84 + """
85 + CITE: http://sebastianruder.com/optimizing-gradient-descent/index.html#adam
86 + ADAM update rules
87 + Default values are taken from [Kingma2014]
88 + References:
89 + [Kingma2014] Kingma, Diederik, and Jimmy Ba.
90 + "Adam: A Method for Stochastic Optimization."
91 + arXiv preprint arXiv:1412.6980 (2014).
92 + http://arxiv.org/pdf/1412.6980v4.pdf
93 + """
94 + updates = []
95 + all_grads = theano.grad(loss, all_params)
96 + alpha = learning_rate
97 + t = theano.shared(np.float32(1))
98 + # (Decay the first moment running average coefficient)
99 + b1_t = b1*gamma**(t-1)
100 +
101 + for params_previous, g in zip(all_params, all_grads):
102 + init_moment = np.zeros(params_previous.get_value().shape,
103 + dtype=theano.config.floatX)
104 + # (the mean)
105 + first_moment = theano.shared(init_moment)
106 + # (the uncentered variance)
107 + second_moment = theano.shared(init_moment)
108 +
109 + # (Update biased first moment estimate)
110 + bias_m = b1_t*first_moment + (1 - b1_t)*g
111 +
112 + # (Update biased second raw moment estimate)
113 + bias_v = b2*second_moment + (1 - b2)*g**2
114 +
115 + # (Compute bias-corrected first moment estimate)
116 + unbias_m = bias_m / (1-b1**t)
117 +
118 + # (Compute bias-corrected second raw moment estimate)
119 + unbias_v = bias_v / (1-b2**t)
120 +
121 +        # (Update parameters)
122 +        update_term = (alpha * unbias_m) / (T.sqrt(unbias_v) + e)
123 +        params_new = params_previous - update_term
124 +
125 +        updates.append((first_moment, bias_m))
126 +        updates.append((second_moment, bias_v))
127 +        updates.append((params_previous, params_new))
128 +    # advance the timestep once per call, not once per parameter
129 +    updates.append((t, t + 1.))
130 +    return updates
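A minimal usage sketch for ADAM_OPTIMIZER with a toy softmax classifier (the loss and variable names here are illustrative, not taken from the repository):

import numpy as np
import theano
import theano.tensor as T
from SmallUtils import createShareVar, ADAM_OPTIMIZER

rng = np.random.RandomState(0)
x = T.matrix("x")
y = T.ivector("y")
W = createShareVar(rng, dim=(20, 2), name="W", factor_for_init=22)
b = theano.shared(np.zeros(2, dtype=theano.config.floatX), name="b")
prob = T.nnet.softmax(T.dot(x, W) + b)
# negative log-likelihood of the correct class
loss = -T.mean(T.log(prob)[T.arange(y.shape[0]), y])
updates = ADAM_OPTIMIZER(loss, all_params=[W, b], learning_rate=0.001)
train_step = theano.function([x, y], loss, updates=updates)

batch_x = rng.rand(8, 20).astype(theano.config.floatX)
batch_y = rng.randint(0, 2, size=8).astype("int32")
print(train_step(batch_x, batch_y))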
1 +#!/usr/bin/env python3
2 +# -*- coding: utf-8 -*-
3 +"""
4 +Created on Thu Apr 13 17:01:36 2017
5 +
6 +@author: red-sky
7 +"""
8 +
9 +import sys
10 +import numpy as np
11 +np.random.seed(280295)
12 +import keras.backend as K
13 +from keras.models import Sequential
14 +from keras.layers import Dense, Activation, Dropout, Flatten
15 +from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
16 +from keras.callbacks import ModelCheckpoint, EarlyStopping
17 +from keras import regularizers, optimizers
18 +
19 +
20 +def recall(y_true, y_pred):
21 + """Recall metric.
22 +
23 + Only computes a batch-wise average of recall.
24 +
25 + Computes the recall, a metric for multi-label classification of
26 + how many relevant items are selected.
27 + """
28 + true_positives = K.sum(K.round(K.clip(y_true[:, 1] * y_pred[:, 1], 0, 1)))
29 + possible_positives = K.sum(K.round(K.clip(y_true[:, 1], 0, 1)))
30 + recall = true_positives / (possible_positives + K.epsilon())
31 + return recall
32 +
33 +
34 +def precision(y_true, y_pred):
35 + """Precision metric.
36 +
37 + Only computes a batch-wise average of precision.
38 +
39 + Computes the precision, a metric for multi-label classification of
40 + how many selected items are relevant.
41 + """
42 + true_positives = K.sum(K.round(K.clip(y_true[:, 1] * y_pred[:, 1], 0, 1)))
43 + predicted_positives = K.sum(K.round(K.clip(y_pred[:, 1], 0, 1)))
44 + precision = true_positives / (predicted_positives + K.epsilon())
45 + return precision
46 +
47 +
48 +def fbeta_score(y_true, y_pred):
49 +
50 + # If there are no true positives, fix the F score at 0 like sklearn.
51 + if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
52 + return 0
53 +
54 + p = precision(y_true, y_pred)
55 + r = recall(y_true, y_pred)
56 + bb = 1 ** 2
57 + fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
58 + return fbeta_score
59 +
60 +
61 +def main(dataX_path, dataY_path, result_path,
62 + n_epoch, input_dim, days):
63 +
64 + # load data
65 + np.random.seed(2204)
66 + X = np.load(dataX_path)
67 + Y = np.load(dataY_path)
68 +
69 + # build Model
70 + model = Sequential()
71 + model.add(Conv1D(128, 1, activation='relu', input_shape=(days, input_dim)))
72 + model.add(Conv1D(128, 3, activation='relu', padding='same'))
73 + model.add(MaxPooling1D(2))
74 + model.add(Flatten())
75 + model.add(Dropout(0.8))
76 + model.add(Dense(2, activation='softmax'))
77 + adam = optimizers.Adam(lr=0.001)
78 + model.compile(loss='categorical_crossentropy',
79 + optimizer=adam,
80 + metrics=['accuracy', recall, precision, fbeta_score])
81 +
82 + # model Compile
83 + model_name = result_path+'model2_price_move_predict.hdf5'
84 + checkpointer = ModelCheckpoint(filepath=model_name,
85 + monitor='val_fbeta_score',
86 + verbose=2, save_best_only=True)
87 + earlystopper = EarlyStopping(monitor='val_loss', patience=20, verbose=2)
88 +
89 + outmodel = open(result_path+'model2_price_move_predict.json', 'w')
90 + outmodel.write(model.to_json())
91 + outmodel.close()
92 +
93 + # process Training
94 + model.fit(X, Y, batch_size=32, verbose=2,
95 + validation_split=0.1, epochs=n_epoch,
96 + callbacks=[checkpointer])
97 +
98 +
99 +if __name__ == "__main__":
100 + dataX = sys.argv[1]
101 + dataY = sys.argv[2]
102 + model_path = sys.argv[3]
103 + n_epoch = int(sys.argv[4])
104 + input_dim = int(sys.argv[5])
105 + days = int(sys.argv[6])
106 + main(dataX, dataY, model_path, n_epoch, input_dim, days)
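This script, like the LSTM and feed-forward variants that follow, is driven by six positional arguments: the X and Y .npy paths, an output directory/prefix, the number of epochs, the embedding dimension, and the window length in days. A hedged smoke-test sketch (the toy file names and the train_cnn.py script name are illustrative only):

import numpy as np

# X: (samples, days, input_dim) windows of daily event embeddings
# Y: one-hot up/down labels, shape (samples, 2)
np.save("toyX.npy", np.random.rand(64, 5, 80).astype("float32"))
np.save("toyY.npy", np.eye(2)[np.random.randint(0, 2, 64)])
# assuming the file above is saved as train_cnn.py:
#   python3 train_cnn.py toyX.npy toyY.npy ./results/ 10 80 5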
1 +#!/usr/bin/env python3
2 +# -*- coding: utf-8 -*-
3 +"""
4 +Created on Thu Apr 13 17:01:36 2017
5 +
6 +@author: red-sky
7 +"""
8 +
9 +import sys
10 +import numpy as np
11 +np.random.seed(280295)
12 +import keras.backend as K
13 +from keras.models import Sequential
14 +from keras import regularizers, optimizers
15 +from keras.layers import Dense, Activation, LSTM, Dropout
16 +from keras.callbacks import ModelCheckpoint, EarlyStopping
17 +
18 +def recall(y_true, y_pred):
19 + """Recall metric.
20 +
21 + Only computes a batch-wise average of recall.
22 +
23 + Computes the recall, a metric for multi-label classification of
24 + how many relevant items are selected.
25 + """
26 + true_positives = K.sum(K.round(K.clip(y_true[:, 0] * y_pred[:, 0], 0, 1)))
27 + possible_positives = K.sum(K.round(K.clip(y_true[:, 0], 0, 1)))
28 + recall = true_positives / (possible_positives + K.epsilon())
29 + return recall
30 +
31 +
32 +def precision(y_true, y_pred):
33 + """Precision metric.
34 +
35 + Only computes a batch-wise average of precision.
36 +
37 + Computes the precision, a metric for multi-label classification of
38 + how many selected items are relevant.
39 + """
40 + true_positives = K.sum(K.round(K.clip(y_true[:, 0] * y_pred[:, 0], 0, 1)))
41 + predicted_positives = K.sum(K.round(K.clip(y_pred[:, 0], 0, 1)))
42 + precision = true_positives / (predicted_positives + K.epsilon())
43 + return precision
44 +
45 +
46 +def fbeta_score(y_true, y_pred):
47 +
48 + # If there are no true positives, fix the F score at 0 like sklearn.
49 + if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
50 + return 0
51 +
52 + p = precision(y_true, y_pred)
53 + r = recall(y_true, y_pred)
54 + bb = 1 ** 2
55 + fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
56 + return fbeta_score
57 +
58 +
59 +def main(dataX_path, dataY_path, result_path,
60 + n_epoch, input_dim, days):
61 +
62 + # load data
63 + np.random.seed(2204)
64 + X = np.load(dataX_path)
65 + Y = np.load(dataY_path)
66 +
67 + # build Model
68 + model = Sequential()
69 + model.add(LSTM(256, input_shape=(days, input_dim),
70 + kernel_regularizer=regularizers.l2(0.001)))
71 +
72 + model.add(Dropout(0.6))
73 + model.add(Dense(2, activation='softmax',
74 + kernel_regularizer=regularizers.l2(0.001)))
75 + adam = optimizers.Adam(lr=0.001)
76 + model.compile(loss='categorical_crossentropy',
77 + optimizer=adam,
78 + metrics=['accuracy', recall, precision, fbeta_score])
79 +
80 + # model Compile
81 + model_name = result_path+'model2_price_move_predict.hdf5'
82 + checkpointer = ModelCheckpoint(filepath=model_name,
83 + monitor='val_fbeta_score', mode="max",
84 + verbose=2, save_best_only=True)
85 + earlystopper = EarlyStopping(monitor='val_loss', patience=20, verbose=2)
86 +
87 + outmodel = open(result_path+'model2_price_move_predict.json', 'w')
88 + outmodel.write(model.to_json())
89 + outmodel.close()
90 +
91 + # process Training
92 + model.fit(X, Y, batch_size=32, verbose=2,
93 + validation_split=0.1, epochs=n_epoch,
94 + callbacks=[checkpointer])
95 +
96 +if __name__ == "__main__":
97 + dataX = sys.argv[1]
98 + dataY = sys.argv[2]
99 + model_path = sys.argv[3]
100 + n_epoch = int(sys.argv[4])
101 + input_dim = int(sys.argv[5])
102 + days = int(sys.argv[6])
103 + main(dataX, dataY, model_path, n_epoch, input_dim, days)
1 +#!/usr/bin/env python3
2 +# -*- coding: utf-8 -*-
3 +"""
4 +Created on Thu Apr 13 17:01:36 2017
5 +
6 +@author: red-sky
7 +"""
8 +
9 +import sys
10 +import numpy as np
11 +np.random.seed(280295)
12 +import keras.backend as K
13 +from keras.models import Sequential
14 +from keras.layers import Dense, Activation, Dropout, Flatten
15 +from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
16 +from keras.callbacks import ModelCheckpoint, EarlyStopping
17 +from keras import regularizers, optimizers
18 +
19 +
20 +def recall(y_true, y_pred):
21 + """Recall metric.
22 +
23 + Only computes a batch-wise average of recall.
24 +
25 + Computes the recall, a metric for multi-label classification of
26 + how many relevant items are selected.
27 + """
28 + true_positives = K.sum(K.round(K.clip(y_true[:, 0] * y_pred[:, 0], 0, 1)))
29 + possible_positives = K.sum(K.round(K.clip(y_true[:, 0], 0, 1)))
30 + recall = true_positives / (possible_positives + K.epsilon())
31 + return recall
32 +
33 +
34 +def precision(y_true, y_pred):
35 + """Precision metric.
36 +
37 + Only computes a batch-wise average of precision.
38 +
39 + Computes the precision, a metric for multi-label classification of
40 + how many selected items are relevant.
41 + """
42 + true_positives = K.sum(K.round(K.clip(y_true[:, 0] * y_pred[:, 0], 0, 1)))
43 + predicted_positives = K.sum(K.round(K.clip(y_pred[:, 0], 0, 1)))
44 + precision = true_positives / (predicted_positives + K.epsilon())
45 + return precision
46 +
47 +
48 +def fbeta_score(y_true, y_pred):
49 +
50 + # If there are no true positives, fix the F score at 0 like sklearn.
51 + if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
52 + return 0
53 +
54 + p = precision(y_true, y_pred)
55 + r = recall(y_true, y_pred)
56 + bb = 1 ** 2
57 + fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
58 + return fbeta_score
59 +
60 +
61 +def main(dataX_path, dataY_path, result_path,
62 + n_epoch, input_dim, days):
63 +
64 + # load data
65 + np.random.seed(2204)
66 + X = np.load(dataX_path)
67 + Y = np.load(dataY_path)
68 +
69 + # build Model
70 + model = Sequential()
71 + model.add(Flatten(input_shape=(days, input_dim)))
72 + model.add(Dense(512, activation='sigmoid'))
73 + model.add(Dropout(0.8))
74 + model.add(Dense(1024, activation='sigmoid'))
75 + model.add(Dropout(0.8))
76 +# model.add(Dense(1024, activation='sigmoid'))
77 + model.add(Dropout(0.8))
78 + model.add(Dense(2, activation='softmax'))
79 +
80 + adam = optimizers.Adam(lr=0.001)
81 + model.compile(loss='categorical_crossentropy',
82 + optimizer=adam,
83 + metrics=['accuracy', recall, precision, fbeta_score])
84 +
85 + # model Compile
86 + model_name = result_path+'model2_price_move_predict.hdf5'
87 + checkpointer = ModelCheckpoint(filepath=model_name, monitor='val_acc',
88 + verbose=2, save_best_only=True)
89 + earlystopper = EarlyStopping(monitor='val_loss', patience=20, verbose=2)
90 +
91 + outmodel = open(result_path+'model2_price_move_predict.json', 'w')
92 + outmodel.write(model.to_json())
93 + outmodel.close()
94 +
95 + # process Training
96 + model.fit(X, Y, batch_size=32, verbose=2,
97 + validation_split=0.1, epochs=n_epoch,
98 + callbacks=[checkpointer])
99 +
100 +
101 +if __name__ == "__main__":
102 + dataX = sys.argv[1]
103 + dataY = sys.argv[2]
104 + model_path = sys.argv[3]
105 + n_epoch = int(sys.argv[4])
106 + input_dim = int(sys.argv[5])
107 + days = int(sys.argv[6])
108 + main(dataX, dataY, model_path, n_epoch, input_dim, days)
1 +#!/usr/bin/env python3
2 +# -*- coding: utf-8 -*-
3 +"""
4 +Created on Thu Mar 16 21:57:57 2017
5 +
6 +@author: red-sky
7 +"""
8 +
9 +import bs4
10 +import json
11 +import sys
12 +import urllib.request as urlreq
13 +from bs4 import BeautifulSoup
14 +import requests
15 +
16 +BLOOMBERG_params = {
17 + "sort_by_newest": "time:desc",
18 + "sort_by_oldest": "time:asc",
19 + "source_from_bloomberg": "sites=bview",
20 + "end_time": "2017-03-12T15:20:16.240Z"
21 +}
22 +
23 +DATA_TO_EXTRACT = {
24 + "query_list_news": ["div", {"class": "search-result-story__container"}],
25 + "query_headline": ["h1", {"class": "search-result-story__headline"}],
26 + "query_time_published": ["time", {"class": "published-at"}],
27 + "query_body": ["div", {"class": "search-result-story__body"}]
28 +}
29 +
30 +
31 +def parser_url(query_string, page,
32 + sort_by="sort_by_oldest",
33 + source="source_from_bloomberg"):
34 + url = "https://www.bloomberg.com/"
35 + # add search query
36 + url = url + "search?query=" + query_string + "&"
37 + # add sort by
38 + url = url + "sort=" + BLOOMBERG_params[sort_by] + "&"
39 +    # restrict results to Bloomberg's own articles (sites filter)
40 + url = url + "sites=" + BLOOMBERG_params[source] + "&"
41 + # add page number
42 + url = url + "page=" + str(page)
43 + return url
44 +
45 +
46 +def get_rid_off_key(list_contents):
47 + body_string = ""
48 + for substring in list_contents:
49 + if (type(substring) == bs4.element.Tag):
50 + # join all body string and
51 + # eliminate highlight query string key
52 + body_string += substring.string
53 + else:
54 + if (type(substring.string) == bs4.element.NavigableString):
55 + body_string += substring.string
56 + return(body_string)
57 +
58 +
59 +def extract_from_url(url):
60 + try:
61 +        # requests responses expose .text, not .read()
62 +        html_of_page = requests.get(url).text
63 +        soup_object = BeautifulSoup(html_of_page, "lxml")
64 + # Extract list of news in soup object
65 + param_to_find = DATA_TO_EXTRACT["query_list_news"]
66 + list_of_news = soup_object.find_all(param_to_find[0],
67 + attrs=param_to_find[1])
68 + if (len(list_of_news) == 0):
69 + return None
70 + # create list result extracted
71 + result = []
72 + for block_new in list_of_news:
73 + # extract time from block
74 + param_to_find = DATA_TO_EXTRACT["query_time_published"]
75 + time = block_new.find_all(param_to_find[0],
76 + attrs=param_to_find[1])
77 + time = time[0]["datetime"]
78 +
79 + # extract new headline
80 + param_to_find = DATA_TO_EXTRACT["query_headline"]
81 + headline = block_new.find_all(param_to_find[0],
82 + attrs=param_to_find[1])
83 + headline = get_rid_off_key(headline[0].a.contents)
84 +
85 + # extract new body list if string
86 + param_to_find = DATA_TO_EXTRACT["query_body"]
87 + body = block_new.find_all(param_to_find[0],
88 + attrs=param_to_find[1])
89 + print(body)
90 +
91 + body_string = get_rid_off_key(body[0].contents)
92 + extracted_from_block = {"time": time,
93 + "headline": headline,
94 + "body": body_string}
95 + # for debug :
96 + # print("\t".join(extracted_from_block))
97 + if len(body_string) >= 5:
98 + result.append(extracted_from_block)
99 + except Exception as inst:
100 +        print("Something went wrong:", inst)
101 +        print("URL: ", url)
102 + result = []
103 + return(result)
104 +
105 +
106 +def Query(key, max_page=5000):
107 +    # Loop over result pages until extract_from_url finds no more news
108 +    page = 1
109 +    all_result_query = []
110 +    error = 0
111 +    while page < max_page:
112 +        print("Collected: %d articles" % len(all_result_query))
113 +        new_url = parser_url(key, page)
114 +        result = extract_from_url(new_url)
115 +        if result is None:
116 +            # no news blocks on this page -> end of the search results
117 +            break
118 +        all_result_query += result
119 +        if len(result) > 0 or error > 10:
120 +            page += 1
121 +            error = 0
122 +        else:
123 +            # empty page (e.g. request error): retry up to 10 times
124 +            error += 1
125 +    return all_result_query
127 +
128 +
129 +if __name__ == "__main__":
130 + print("Begin query information about: ", sys.argv[1])
131 + print("Then will save result in: ", sys.argv[2])
132 +
133 + News = Query(sys.argv[1], int(sys.argv[4]))
134 + file_name1 = sys.argv[2]
135 +
136 + with open(file_name1, "w") as W:
137 + json.dump(News, W, indent=1)
138 +
139 + file_name2 = sys.argv[3]
140 + with open(file_name2, "w") as W:
141 + W.write("\n".join([new["body"] for new in News]))
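A hedged invocation sketch (the crawler.py name is an assumption; the arguments are the search query, the JSON output path, the plain-text body output path, and the maximum number of result pages, read from argv[4]):

# python3 crawler.py "Apple" Apple_query_result.json Apple_query_result_body.txt 500
# -> Apple_query_result.json : list of {"time", "headline", "body"} records
# -> Apple_query_result_body.txt : one article body per line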
1 +#!/usr/bin/env python3
2 +# -*- coding: utf-8 -*-
3 +"""
4 +Created on Mon Mar 20 17:52:11 2017
5 +
6 +@author: red-sky
7 +"""
8 +import sys
9 +import json
10 +import numpy as np
11 +
12 +
13 +def updateDict(words, dictUp):
14 + # update word dictionary with given "words" and the dict "dictUp"
15 + for w in words:
16 + if w in dictUp:
17 + dictUp[w] += 1
18 +        else:
19 +            dictUp[w] = 1  # first occurrence counts as one
20 +    return dictUp
21 +
22 +def extractVocab(eventsFile, fromIndex=0, toIndex="END"):
23 +    # from the Events file, extract word counts and create index mappings
24 + vocab = dict()
25 + with open(eventsFile, "r") as file:
26 + list_events = file.read().strip().splitlines()
27 + if toIndex == -1:
28 + list_events = list_events[fromIndex:]
29 + else:
30 + list_events = sorted(set(list_events[fromIndex:toIndex]))
31 + for i, event in enumerate(list_events):
32 + if event[0] != "\t":
33 + index = i
34 + break
35 + list_events = list_events[index:]
36 + for event in list_events:
37 + event = event.split("\t")
38 + words = event[1].split(" ") + \
39 + event[2].split(" ") + \
40 + event[3].split(" ")
41 + vocab = updateDict(words, vocab)
42 + vocab_words = vocab.keys()
43 + support_words = ["NOISEWORDS"]
44 + vocab_words = support_words + \
45 + sorted(vocab_words, key=lambda x: vocab[x], reverse=True)
46 + IndexWords = range(len(vocab_words))
47 + Count = ["NOISEWORDS"] + [vocab[w] for w in vocab_words[1:]]
48 + result = [dict(zip(vocab_words, Count)),
49 + dict(zip(IndexWords, vocab_words)),
50 + dict(zip(vocab_words, IndexWords))]
51 + return result, list_events
52 +
53 +
54 +def convertEvent(eventsFile, vocabMapping, countMin=5):
55 + # convert all Events to index for training
56 + wordCount, _, word2index = vocabMapping
57 + Events = []
58 + with open(eventsFile, "r") as file:
59 + list_events = file.read().strip().splitlines()
60 +
61 + for event in list_events:
62 + event = event.split("\t")
63 + list_obj = [event[1].split(" "),
64 + event[2].split(" "),
65 + event[3].split(" ")]
66 +
67 +        # Convert only words that appear at least countMin times
68 + wordsIndexed = []
69 + for obj in list_obj:
70 + objIndex = []
71 + for w in obj:
72 + if wordCount[w] >= countMin:
73 + objIndex.append(word2index[w])
74 + else:
75 + objIndex.append(0)
76 + wordsIndexed.append(objIndex)
77 + Events.append(wordsIndexed)
78 + return Events
79 +
80 +
81 +if __name__ == "__main__":
82 + # in
83 + EventPath = "../../Thesis_data/Apple_query_result_body.txt"
84 + fromIndex = 0
85 + toIndex = -1
86 + minCountWord = 5
87 + # out
88 + EventNewPath = "./Events_for_training.txt"
89 + VocabPath = "./Vocab_in_events_for_training.json"
90 + IndexdEventPath = "./IndexedEvents_for_training.npy"
91 +
92 + vocabMapping, EventNew = extractVocab(EventPath, fromIndex, toIndex)
93 + with open(VocabPath, "w") as W:
94 + json.dump(vocabMapping, W, indent=2)
95 +
96 + with open(EventNewPath, "w") as W:
97 + W.write("\n".join(EventNew))
98 +
99 + indexed_events = convertEvent(EventNewPath, vocabMapping, minCountWord)
100 + np.save(arr=np.array(indexed_events), file=IndexdEventPath)
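For reference, a small sketch of the tab-separated event format these functions expect (a leading field such as the date, then the actor, action, and object word sequences in fields 1-3); the toy file, module name, and minimum count below are illustrative assumptions:

# assuming the module above is saved as events_to_index.py
from events_to_index import extractVocab, convertEvent

toy = ("2013-10-28\tgoldman sachs\tupgraded\tapple inc\n"
       "2013-10-29\tapple inc\tunveiled\tnew ipad")
with open("toy_events.txt", "w") as f:
    f.write(toy)

vocabMapping, cleaned = extractVocab("toy_events.txt", 0, -1)
indexed = convertEvent("toy_events.txt", vocabMapping, countMin=1)
print(indexed[0])  # [[actor indices], [action indices], [object indices]]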
1 +#!/usr/bin/env python3
2 +# -*- coding: utf-8 -*-
3 +"""
4 +Created on Mon Mar 20 11:58:54 2017
5 +
6 +@author: red-sky
7 +"""
8 +
9 +import sys
10 +import json
11 +
12 +
13 +def findDate(news_body, list_news):
14 + date = ""
15 + for ind, new in enumerate(list_news):
16 + if news_body in new["body"]:
17 + date = new["time"]
18 + break
19 + return date
20 +
21 +
22 +def extractAllDate(list_events, list_news, choosedInfor=[1, 2, 3, 0, 6]):
23 + list_result = []
24 + N = len(list_events)
25 + i = 0.0
26 + for event in list_events:
27 + i += 1
28 + if i % 1000 == 0:
29 +            print("Done %.1f percent" % (i / N * 100))
30 + date = [findDate(event[6], list_news)]
31 + infor = date + [event[i] for i in choosedInfor]
32 + list_result.append(infor)
33 + return list_result
34 +
35 +if __name__ == "__main__":
36 + events = open(sys.argv[1], "r").read().strip().splitlines()
37 + events = [event.split("\t") for event in events
38 +              if len(event.split("\t")) > 6]  # event[6] (the source sentence) is read below
39 + news = json.load(open(sys.argv[2], "r"))
40 + result = extractAllDate(events, news)
41 +
42 + with open(sys.argv[3], "w") as W:
43 + for line in result[1:]:
44 + W.write("\t".join(line)+"\n")
1 +#!/usr/bin/env python3
2 +# -*- coding: utf-8 -*-
3 +"""
4 +Created on Thu Apr 13 16:57:11 2017
5 +
6 +@author: red-sky
7 +"""
8 +import sys
9 +import numpy as np
10 +import pickle
11 +import pandas as pd
12 +
13 +
14 +def main(VectorsPath, EventPath, StockPricePath, days):
15 +
16 + with open(VectorsPath, "rb") as H:
17 + Vec = pickle.load(H)
18 + Vectors = np.array([list(b[0]) for a, b in Vec.values()])
19 +# Vectors = np.load(VectorsPath)
20 + with open(EventPath, "r") as H:
21 + F = np.array([a.split("\t")[0:4] for a in H.read().splitlines()])
22 +
23 + D = {}
24 + for date, vec in zip(F[:, 0], Vectors):
25 + if date[:10] in D:
26 + D[date[:10]].append(vec)
27 + else:
28 + D[date[:10]] = [vec]
29 +
30 + D2 = {}
31 + for date in sorted(D.keys()):
32 + D2[date] = np.mean(D[date], 0)
33 +
34 + Dates = np.array(sorted(D2.keys()))
35 +    SampleIndex = [list(range(i - days, i)) for i in range(days, len(Dates))]
36 + DataX = []
37 + DateX = []
38 + for listIndex in SampleIndex:
39 + DataX.append([D2[date] for date in Dates[listIndex]])
40 + DateX.append(Dates[listIndex[-1]])
41 +
42 + Df = pd.read_csv(StockPricePath)
43 + LabelY = []
44 + DataX_yesData = []
45 + for i, date in enumerate(DateX):
46 + retu = list(Df.loc[Df["Date"] == date]["ReturnOpen"])
47 + print(retu)
48 + if len(retu) > 0:
49 + retu = float(retu[0])*100
50 +            if retu > 0:
51 +                LabelY.append([1, 0])
52 +            else:
53 +                # zero or negative daily return is labelled "down"
54 +                LabelY.append([0, 1])
56 + DataX_yesData.append(list(DataX[i]))
57 + print(date)
58 +# else:
59 +
60 + dataX = np.array(DataX_yesData)
61 + dataY = np.array(LabelY)
62 + print("DataX:", dataX.shape)
63 + print("DataY:", dataY.shape, np.sum(dataY, 0) / np.sum(dataY))
64 + return (dataX, dataY)
65 +
66 +if __name__ == "__main__":
67 + VectorsPath = sys.argv[1]
68 + EventPath = sys.argv[2]
69 + StockPricePath = sys.argv[3]
70 + days = int(sys.argv[5])
71 + DataX, LabelY = main(VectorsPath, EventPath, StockPricePath, days)
72 + DataPath = sys.argv[4]
73 + np.save(arr=DataX, file=DataPath+"/DailyVector" + sys.argv[5] + ".npy")
74 + np.save(arr=LabelY, file=DataPath+"/DailyReturn" + sys.argv[5] + ".npy")
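Finally, a hedged invocation sketch for this preparation step (the prepare_data.py, dated-events, and price-CSV names are assumptions; the CSV must contain Date and ReturnOpen columns, and argv[5] is the window length in days):

# python3 prepare_data.py resultEmbeding.pickle Events_with_dates.txt AAPL_prices.csv ./data 5
# -> ./data/DailyVector5.npy : (samples, days, embedding dim) windows of mean daily event vectors
# -> ./data/DailyReturn5.npy : one-hot [up, down] labels per window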