최재은

Add: Event Embedding code & data, revised final report

Showing 41 changed files with 984 additions and 0 deletions
import numpy as np
print(np.load("./resultEmbeding.pickle", allow_pickle=True))
{"class_name": "Sequential", "config": [{"class_name": "Flatten", "config": {"name": "flatten_1", "trainable": true, "batch_input_shape": [null, 5, 80], "dtype": "float32"}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 512, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.8}}, {"class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 1024, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_2", "trainable": true, "rate": 0.8}}, {"class_name": "Dropout", "config": {"name": "dropout_3", "trainable": true, "rate": 0.8}}, {"class_name": "Dense", "config": {"name": "dense_3", "trainable": true, "units": 2, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.0.2", "backend": "theano"}
{"class_name": "Sequential", "config": [{"class_name": "Conv1D", "config": {"name": "conv1d_1", "trainable": true, "batch_input_shape": [null, 5, 80], "dtype": "float32", "filters": 128, "kernel_size": [1], "strides": [1], "padding": "valid", "dilation_rate": [1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Conv1D", "config": {"name": "conv1d_2", "trainable": true, "filters": 128, "kernel_size": [3], "strides": [1], "padding": "same", "dilation_rate": [1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "MaxPooling1D", "config": {"name": "max_pooling1d_1", "trainable": true, "strides": [2], "pool_size": [2], "padding": "valid"}}, {"class_name": "Flatten", "config": {"name": "flatten_1", "trainable": true}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.8}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 2, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.0.2", "backend": "theano"}
{"class_name": "Sequential", "config": [{"class_name": "LSTM", "config": {"name": "lstm_1", "trainable": true, "batch_input_shape": [null, 5, 80], "dtype": "float32", "return_sequences": false, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 256, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": {"class_name": "L1L2", "config": {"l1": 0.0, "l2": 0.0010000000474974513}}, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.6}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 2, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": {"class_name": "L1L2", "config": {"l1": 0.0, "l2": 0.0010000000474974513}}, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.0.2", "backend": "theano"}
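The three JSON files above are the serialized Keras 2.0 architectures (MLP, CNN, and LSTM classifiers over 5-day windows of 80-dimensional event vectors). A minimal sketch of restoring one of them for inspection, assuming the JSON and the HDF5 checkpoint written by the training scripts below sit in the working directory (the paths are illustrative):

from keras.models import model_from_json

with open("model2_price_move_predict.json") as f:
    model = model_from_json(f.read())
# weights come from the ModelCheckpoint file written during training
model.load_weights("model2_price_move_predict.hdf5")
model.summary()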
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 20 23:41:51 2017
@author: red-sky
"""
import numpy as np
import theano
from theano import tensor as T
class EmbeddingLayer(object):
def __init__(self, num_vocab, word_dim, rng, embedding_w=None):
'''
word_dim :: dimension of the word embeddings
num_vocab :: number of word embeddings in the vocabulary
embedding_w :: pre-train word vector
'''
        if embedding_w is None:
            embedding_w = rng.uniform(-1.0, 1.0, (num_vocab, word_dim))
        # Cast before wrapping: calling .astype on a shared variable would
        # return a plain symbolic tensor instead of a trainable shared variable.
        self.embedding_w = theano.shared(
            np.asarray(embedding_w, dtype=theano.config.floatX),
            name="EmbeddingLayer_W", borrow=True)
self.params = [self.embedding_w]
self.infor = [num_vocab, word_dim]
def words_ind_2vec(self, index):
map_word_vectors = self.embedding_w[index]
output = T.mean(map_word_vectors, axis=0)
return output, map_word_vectors
if __name__ == "__main__":
    rng = np.random.RandomState(220495)
    arrWords = T.ivector("words")
    EMBD = EmbeddingLayer(100, 150, rng=rng)
    # words_ind_2vec returns (mean vector, per-word vectors), so ask the
    # compiled function for both outputs explicitly.
    mean_vec, word_vecs = EMBD.words_ind_2vec(arrWords)
    Word2Vec = theano.function(inputs=[arrWords],
                               outputs=[mean_vec, word_vecs])
    Vec, word_vectors = Word2Vec([1, 2, 3, 4])
    Vec, word_vectors = Word2Vec([2, 3, 4])
    print("Dim: ", Vec.shape)
    print("Val: ", Vec)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 25 16:13:18 2017
@author: red-sky
"""
import theano
import numpy as np
import theano.tensor as T
from SmallUtils import createShareVar
class RoleDependentLayer(object):
def __init__(self, left_dependent, right_dependent, rng,
n_in=100, n_out=4, trainedParams=None,
name="RoleDependentEmbedding_"):
if trainedParams is None:
trainedParams = {
name: {
"T": None, "W1": None, "W2": None, "b": None
}
}
if trainedParams[name]["T"] is not None:
assert trainedParams[name]["T"].shape == (n_out, n_in, n_in)
self.T = theano.shared(value=trainedParams[name]["T"],
name=name+"T", borrow=True)
else:
self.T = createShareVar(rng=rng, name=name+"T",
factor_for_init=n_out + n_in,
dim=(n_out, n_in, n_in))
if trainedParams[name]["W1"] is not None:
assert trainedParams[name]["W1"].shape == (n_in, n_out)
self.W1 = theano.shared(value=trainedParams[name]["W1"],
name=name+"W1", borrow=True)
else:
self.W1 = createShareVar(rng=rng, name=name+"W1",
factor_for_init=n_out + n_in,
dim=(n_in, n_out))
if trainedParams[name]["W2"] is not None:
assert trainedParams[name]["W2"].shape == (n_in, n_out)
self.W2 = theano.shared(value=trainedParams[name]["W2"],
name=name+"W2", borrow=True)
else:
self.W2 = createShareVar(rng=rng, name=name+"W2",
factor_for_init=n_out + n_in,
dim=(n_in, n_out))
if trainedParams[name]["b"] is not None:
assert trainedParams[name]["b"].shape == (n_out,)
self.b = theano.shared(value=trainedParams[name]["b"],
name=name+"b", borrow=True)
else:
b_values = np.zeros(shape=(n_out,), dtype=theano.config.floatX)
self.b = theano.shared(value=b_values, name=name+"b", borrow=True)
        # list of layer parameters
        self.params = [self.T, self.W1, self.W2, self.b]
        self.n_out = n_out  # kept for the standalone output_() method below
        # L2 regularization term
        self.L2 = sum([(param ** 2).sum() for param in self.params])
# Bi-linear step
        def one_kernel(Tk, left, right):
            first_bi_linear = theano.dot(left, Tk)
            second_bi_linear = theano.dot(first_bi_linear, right)
            return second_bi_linear.flatten()
bi_1, _ = theano.scan(
fn=one_kernel,
sequences=[self.T],
non_sequences=[left_dependent, right_dependent],
n_steps=n_out
)
# Feed forward network step
feedforward_step1 = theano.dot(left_dependent, self.W1)
feedforward_step2 = theano.dot(right_dependent, self.W2)
feedforward_step3 = (feedforward_step1 +
feedforward_step2.dimshuffle("x", 0) +
self.b.dimshuffle("x", 0))
feedforward_step4 = bi_1.dimshuffle(1, 0) + feedforward_step3
self.output = theano.tensor.tanh(feedforward_step4)
self.test = [feedforward_step3]
    def output_(self, left_dependent, right_dependent):
        def one_kernel(Tk, left, right):
            first_bi_linear = theano.dot(left, Tk)
            second_bi_linear = theano.dot(first_bi_linear, right)
            return second_bi_linear.flatten()
        bi_linear_tensor, _ = theano.scan(
            fn=one_kernel,
            sequences=[self.T],
            non_sequences=[left_dependent, right_dependent],
            n_steps=self.n_out  # n_out is a constructor argument, not in scope here
        )
bi_linear_tensor = bi_linear_tensor.dimshuffle(1, 0)
feedforward_step1 = theano.dot(left_dependent, self.W1)
feedforward_step2 = theano.dot(right_dependent, self.W2)
feedforward_step3 = (feedforward_step1 +
feedforward_step2.dimshuffle("x", 0) +
self.b.dimshuffle("x", 0))
feedforward_step4 = bi_linear_tensor + feedforward_step3
output = theano.tensor.tanh(feedforward_step4)
return(output)
def get_params(self):
trainedParams = {
"T": self.T.get_value(), "W1": self.W1.get_value(),
"W2": self.W2.get_value(), "b": self.b.get_value()
}
return(trainedParams)
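A minimal sketch of composing two dependents with RoleDependentLayer, with the class above in scope; the 100-dimensional inputs, n_out=4, and the random vectors are illustrative rather than the project's actual training wiring:

import numpy as np
import theano
import theano.tensor as T

rng = np.random.RandomState(220495)
left = T.vector("left")     # e.g. averaged word vectors of the actor
right = T.vector("right")   # e.g. averaged word vectors of the action

layer = RoleDependentLayer(left_dependent=left, right_dependent=right,
                           rng=rng, n_in=100, n_out=4)
compose = theano.function([left, right], layer.output)

out = compose(np.random.rand(100).astype(theano.config.floatX),
              np.random.rand(100).astype(theano.config.floatX))
print(out.shape)  # (1, 4): one 4-dimensional role-dependent embedding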
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 25 15:55:14 2017
@author: red-sky
"""
import theano
import theano.tensor as T
import numpy as np
def createShareVar(rng, dim, name, factor_for_init):
    var_values = np.asarray(
        rng.uniform(
            low=-np.sqrt(6.0 / factor_for_init),
            high=np.sqrt(6.0 / factor_for_init),
            size=dim,
        ),
        # cast to floatX so the shared variable matches Theano's configured dtype
        dtype=theano.config.floatX,
    )
Var = theano.shared(value=var_values, name=name, borrow=True)
return Var
def adadelta(lr, tparams, cost, grads, listInput):
"""
An adaptive learning rate optimizer
Parameters
----------
lr : Theano SharedVariable
Initial learning rate
    tparams: Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of cost w.r.t. the parameters
    cost: Theano variable
        Objective function to minimize
Notes
-----
For more information, see [ADADELTA]_.
.. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
Rate Method*, arXiv:1212.5701.
"""
np_float = np.asarray(0., dtype=theano.config.floatX)
zipped_grads = [theano.shared(p.get_value() * np_float,
name='%s_grad' % k)
for k, p in enumerate(tparams)]
running_up2 = [theano.shared(p.get_value() * np_float,
name='%s_rup2' % k)
for k, p in enumerate(tparams)]
running_grads2 = [theano.shared(p.get_value() * np_float,
name='%s_rgrad2' % k)
for k, p in enumerate(tparams)]
zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
for rg2, g in zip(running_grads2, grads)]
f_grad_shared = theano.function(inputs=listInput,
outputs=cost,
updates=zgup + rg2up,
name='adadelta_f_grad_shared')
updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
for zg, ru2, rg2 in zip(zipped_grads,
running_up2,
running_grads2)]
ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
for ru2, ud in zip(running_up2, updir)]
param_up = [(p, p + ud) for p, ud in zip(tparams, updir)]
f_update = theano.function([lr], [], updates=ru2up + param_up,
on_unused_input='ignore',
name='adadelta_f_update')
return f_grad_shared, f_update
def ADAM_OPTIMIZER(loss, all_params, learning_rate=0.001,
b1=0.9, b2=0.999, e=1e-8, gamma=1-1e-8):
"""
CITE: http://sebastianruder.com/optimizing-gradient-descent/index.html#adam
ADAM update rules
Default values are taken from [Kingma2014]
References:
[Kingma2014] Kingma, Diederik, and Jimmy Ba.
"Adam: A Method for Stochastic Optimization."
arXiv preprint arXiv:1412.6980 (2014).
http://arxiv.org/pdf/1412.6980v4.pdf
"""
updates = []
all_grads = theano.grad(loss, all_params)
alpha = learning_rate
t = theano.shared(np.float32(1))
# (Decay the first moment running average coefficient)
b1_t = b1*gamma**(t-1)
for params_previous, g in zip(all_params, all_grads):
init_moment = np.zeros(params_previous.get_value().shape,
dtype=theano.config.floatX)
# (the mean)
first_moment = theano.shared(init_moment)
# (the uncentered variance)
second_moment = theano.shared(init_moment)
# (Update biased first moment estimate)
bias_m = b1_t*first_moment + (1 - b1_t)*g
# (Update biased second raw moment estimate)
bias_v = b2*second_moment + (1 - b2)*g**2
# (Compute bias-corrected first moment estimate)
unbias_m = bias_m / (1-b1**t)
# (Compute bias-corrected second raw moment estimate)
unbias_v = bias_v / (1-b2**t)
# (Update parameters)
update_term = (alpha * unbias_m) / (T.sqrt(unbias_v) + e)
params_new = params_previous - update_term
updates.append((first_moment, bias_m))
updates.append((second_moment, bias_v))
        updates.append((params_previous, params_new))
    # Advance the shared timestep once per call; appending it inside the loop
    # would register duplicate updates for t, which Theano rejects.
    updates.append((t, t + np.float32(1.0)))
    return updates
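A minimal sketch of using ADAM_OPTIMIZER on a toy objective, assuming Theano's default float64 configuration; the quadratic cost and the shared parameter W are illustrative only:

import numpy as np
import theano
import theano.tensor as T

W = theano.shared(np.ones(3, dtype=theano.config.floatX), name="W")
target = T.vector("target")
cost = T.sum((W - target) ** 2)

updates = ADAM_OPTIMIZER(loss=cost, all_params=[W], learning_rate=0.01)
train_step = theano.function([target], cost, updates=updates)

zeros = np.zeros(3, dtype=theano.config.floatX)
for _ in range(200):
    train_step(zeros)
print(W.get_value())  # should move toward the target vector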
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 13 17:01:36 2017
@author: red-sky
"""
import sys
import numpy as np
np.random.seed(280295)
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import regularizers, optimizers
def recall(y_true, y_pred):
"""Recall metric.
Only computes a batch-wise average of recall.
Computes the recall, a metric for multi-label classification of
how many relevant items are selected.
"""
true_positives = K.sum(K.round(K.clip(y_true[:, 1] * y_pred[:, 1], 0, 1)))
possible_positives = K.sum(K.round(K.clip(y_true[:, 1], 0, 1)))
recall = true_positives / (possible_positives + K.epsilon())
return recall
def precision(y_true, y_pred):
"""Precision metric.
Only computes a batch-wise average of precision.
Computes the precision, a metric for multi-label classification of
how many selected items are relevant.
"""
true_positives = K.sum(K.round(K.clip(y_true[:, 1] * y_pred[:, 1], 0, 1)))
predicted_positives = K.sum(K.round(K.clip(y_pred[:, 1], 0, 1)))
precision = true_positives / (predicted_positives + K.epsilon())
return precision
def fbeta_score(y_true, y_pred):
# If there are no true positives, fix the F score at 0 like sklearn.
if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
return 0
p = precision(y_true, y_pred)
r = recall(y_true, y_pred)
bb = 1 ** 2
fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
return fbeta_score
def main(dataX_path, dataY_path, result_path,
n_epoch, input_dim, days):
# load data
np.random.seed(2204)
X = np.load(dataX_path)
Y = np.load(dataY_path)
# build Model
model = Sequential()
model.add(Conv1D(128, 1, activation='relu', input_shape=(days, input_dim)))
model.add(Conv1D(128, 3, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Flatten())
model.add(Dropout(0.8))
model.add(Dense(2, activation='softmax'))
adam = optimizers.Adam(lr=0.001)
model.compile(loss='categorical_crossentropy',
optimizer=adam,
metrics=['accuracy', recall, precision, fbeta_score])
# model Compile
model_name = result_path+'model2_price_move_predict.hdf5'
    # mode="max": without it Keras treats an unknown metric name as one to minimize
    checkpointer = ModelCheckpoint(filepath=model_name,
                                   monitor='val_fbeta_score', mode="max",
                                   verbose=2, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=20, verbose=2)
outmodel = open(result_path+'model2_price_move_predict.json', 'w')
outmodel.write(model.to_json())
outmodel.close()
# process Training
model.fit(X, Y, batch_size=32, verbose=2,
validation_split=0.1, epochs=n_epoch,
callbacks=[checkpointer])
if __name__ == "__main__":
dataX = sys.argv[1]
dataY = sys.argv[2]
model_path = sys.argv[3]
n_epoch = int(sys.argv[4])
input_dim = int(sys.argv[5])
days = int(sys.argv[6])
main(dataX, dataY, model_path, n_epoch, input_dim, days)
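A minimal sketch of calling the entry point above directly instead of via sys.argv; the file locations and the epoch count are illustrative, while the 5-day window and 80-dimensional inputs match the DailyVector/DailyReturn arrays prepared later in this commit:

main(dataX_path="./DailyVector5.npy", dataY_path="./DailyReturn5.npy",
     result_path="./", n_epoch=100, input_dim=80, days=5)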
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 13 17:01:36 2017
@author: red-sky
"""
import sys
import numpy as np
np.random.seed(280295)
import keras.backend as K
from keras.models import Sequential
from keras import regularizers, optimizers
from keras.layers import Dense, Activation, LSTM, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
def recall(y_true, y_pred):
"""Recall metric.
Only computes a batch-wise average of recall.
Computes the recall, a metric for multi-label classification of
how many relevant items are selected.
"""
true_positives = K.sum(K.round(K.clip(y_true[:, 0] * y_pred[:, 0], 0, 1)))
possible_positives = K.sum(K.round(K.clip(y_true[:, 0], 0, 1)))
recall = true_positives / (possible_positives + K.epsilon())
return recall
def precision(y_true, y_pred):
"""Precision metric.
Only computes a batch-wise average of precision.
Computes the precision, a metric for multi-label classification of
how many selected items are relevant.
"""
true_positives = K.sum(K.round(K.clip(y_true[:, 0] * y_pred[:, 0], 0, 1)))
predicted_positives = K.sum(K.round(K.clip(y_pred[:, 0], 0, 1)))
precision = true_positives / (predicted_positives + K.epsilon())
return precision
def fbeta_score(y_true, y_pred):
# If there are no true positives, fix the F score at 0 like sklearn.
if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
return 0
p = precision(y_true, y_pred)
r = recall(y_true, y_pred)
bb = 1 ** 2
fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
return fbeta_score
def main(dataX_path, dataY_path, result_path,
n_epoch, input_dim, days):
# load data
np.random.seed(2204)
X = np.load(dataX_path)
Y = np.load(dataY_path)
# build Model
model = Sequential()
model.add(LSTM(256, input_shape=(days, input_dim),
kernel_regularizer=regularizers.l2(0.001)))
model.add(Dropout(0.6))
model.add(Dense(2, activation='softmax',
kernel_regularizer=regularizers.l2(0.001)))
adam = optimizers.Adam(lr=0.001)
model.compile(loss='categorical_crossentropy',
optimizer=adam,
metrics=['accuracy', recall, precision, fbeta_score])
# model Compile
model_name = result_path+'model2_price_move_predict.hdf5'
checkpointer = ModelCheckpoint(filepath=model_name,
monitor='val_fbeta_score', mode="max",
verbose=2, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=20, verbose=2)
outmodel = open(result_path+'model2_price_move_predict.json', 'w')
outmodel.write(model.to_json())
outmodel.close()
# process Training
model.fit(X, Y, batch_size=32, verbose=2,
validation_split=0.1, epochs=n_epoch,
callbacks=[checkpointer])
if __name__ == "__main__":
dataX = sys.argv[1]
dataY = sys.argv[2]
model_path = sys.argv[3]
n_epoch = int(sys.argv[4])
input_dim = int(sys.argv[5])
days = int(sys.argv[6])
main(dataX, dataY, model_path, n_epoch, input_dim, days)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 13 17:01:36 2017
@author: red-sky
"""
import sys
import numpy as np
np.random.seed(280295)
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import regularizers, optimizers
def recall(y_true, y_pred):
"""Recall metric.
Only computes a batch-wise average of recall.
Computes the recall, a metric for multi-label classification of
how many relevant items are selected.
"""
true_positives = K.sum(K.round(K.clip(y_true[:, 0] * y_pred[:, 0], 0, 1)))
possible_positives = K.sum(K.round(K.clip(y_true[:, 0], 0, 1)))
recall = true_positives / (possible_positives + K.epsilon())
return recall
def precision(y_true, y_pred):
"""Precision metric.
Only computes a batch-wise average of precision.
Computes the precision, a metric for multi-label classification of
how many selected items are relevant.
"""
true_positives = K.sum(K.round(K.clip(y_true[:, 0] * y_pred[:, 0], 0, 1)))
predicted_positives = K.sum(K.round(K.clip(y_pred[:, 0], 0, 1)))
precision = true_positives / (predicted_positives + K.epsilon())
return precision
def fbeta_score(y_true, y_pred):
# If there are no true positives, fix the F score at 0 like sklearn.
if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
return 0
p = precision(y_true, y_pred)
r = recall(y_true, y_pred)
bb = 1 ** 2
fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
return fbeta_score
def main(dataX_path, dataY_path, result_path,
n_epoch, input_dim, days):
# load data
np.random.seed(2204)
X = np.load(dataX_path)
Y = np.load(dataY_path)
# build Model
model = Sequential()
model.add(Flatten(input_shape=(days, input_dim)))
model.add(Dense(512, activation='sigmoid'))
model.add(Dropout(0.8))
model.add(Dense(1024, activation='sigmoid'))
model.add(Dropout(0.8))
# model.add(Dense(1024, activation='sigmoid'))
model.add(Dropout(0.8))
model.add(Dense(2, activation='softmax'))
adam = optimizers.Adam(lr=0.001)
model.compile(loss='categorical_crossentropy',
optimizer=adam,
metrics=['accuracy', recall, precision, fbeta_score])
# model Compile
model_name = result_path+'model2_price_move_predict.hdf5'
checkpointer = ModelCheckpoint(filepath=model_name, monitor='val_acc',
verbose=2, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=20, verbose=2)
outmodel = open(result_path+'model2_price_move_predict.json', 'w')
outmodel.write(model.to_json())
outmodel.close()
# process Training
model.fit(X, Y, batch_size=32, verbose=2,
validation_split=0.1, epochs=n_epoch,
callbacks=[checkpointer])
if __name__ == "__main__":
dataX = sys.argv[1]
dataY = sys.argv[2]
model_path = sys.argv[3]
n_epoch = int(sys.argv[4])
input_dim = int(sys.argv[5])
days = int(sys.argv[6])
main(dataX, dataY, model_path, n_epoch, input_dim, days)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 16 21:57:57 2017
@author: red-sky
"""
import bs4
import json
import sys
import urllib.request as urlreq
from bs4 import BeautifulSoup
import requests
BLOOMBERG_params = {
"sort_by_newest": "time:desc",
"sort_by_oldest": "time:asc",
"source_from_bloomberg": "sites=bview",
"end_time": "2017-03-12T15:20:16.240Z"
}
DATA_TO_EXTRACT = {
"query_list_news": ["div", {"class": "search-result-story__container"}],
"query_headline": ["h1", {"class": "search-result-story__headline"}],
"query_time_published": ["time", {"class": "published-at"}],
"query_body": ["div", {"class": "search-result-story__body"}]
}
def parser_url(query_string, page,
sort_by="sort_by_oldest",
source="source_from_bloomberg"):
url = "https://www.bloomberg.com/"
# add search query
url = url + "search?query=" + query_string + "&"
# add sort by
url = url + "sort=" + BLOOMBERG_params[sort_by] + "&"
# add time to query -- use present time
url = url + "sites=" + BLOOMBERG_params[source] + "&"
# add page number
url = url + "page=" + str(page)
return url
def get_rid_off_key(list_contents):
body_string = ""
for substring in list_contents:
if (type(substring) == bs4.element.Tag):
# join all body string and
# eliminate highlight query string key
body_string += substring.string
else:
if (type(substring.string) == bs4.element.NavigableString):
body_string += substring.string
return(body_string)
def extract_from_url(url):
    try:
        # requests responses expose .content / .text, not .read()
        response = requests.get(url)
        html_of_page = response.content
        soup_object = BeautifulSoup(html_of_page, "lxml")
# Extract list of news in soup object
param_to_find = DATA_TO_EXTRACT["query_list_news"]
list_of_news = soup_object.find_all(param_to_find[0],
attrs=param_to_find[1])
if (len(list_of_news) == 0):
return None
# create list result extracted
result = []
for block_new in list_of_news:
# extract time from block
param_to_find = DATA_TO_EXTRACT["query_time_published"]
time = block_new.find_all(param_to_find[0],
attrs=param_to_find[1])
time = time[0]["datetime"]
# extract new headline
param_to_find = DATA_TO_EXTRACT["query_headline"]
headline = block_new.find_all(param_to_find[0],
attrs=param_to_find[1])
headline = get_rid_off_key(headline[0].a.contents)
# extract new body list if string
param_to_find = DATA_TO_EXTRACT["query_body"]
body = block_new.find_all(param_to_find[0],
attrs=param_to_find[1])
print(body)
body_string = get_rid_off_key(body[0].contents)
extracted_from_block = {"time": time,
"headline": headline,
"body": body_string}
# for debug :
# print("\t".join(extracted_from_block))
if len(body_string) >= 5:
result.append(extracted_from_block)
    except Exception as inst:
        print("Something went wrong:", inst)
        print("URL: ", url)
result = []
return(result)
def Query(key, max_page=5000):
# Init page and looping until return None
page = 1
result = "not None"
all_result_query = []
error = 0
while True and page < max_page:
print("Colected: %d articles" % len(all_result_query))
new_url = parser_url(key, page)
result = extract_from_url(new_url)
        if result or error > 10:  # result may be None or empty when no news is returned
page += 1
error = 0
else:
error += 1
if result is not None:
all_result_query += result
else:
break
return(all_result_query)
if __name__ == "__main__":
print("Begin query information about: ", sys.argv[1])
print("Then will save result in: ", sys.argv[2])
News = Query(sys.argv[1], int(sys.argv[4]))
file_name1 = sys.argv[2]
with open(file_name1, "w") as W:
json.dump(News, W, indent=1)
file_name2 = sys.argv[3]
with open(file_name2, "w") as W:
W.write("\n".join([new["body"] for new in News]))
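A minimal sketch of the equivalent programmatic call to the crawler above; the query string, page limit, and output paths are illustrative:

news = Query("Apple", max_page=50)
with open("Apple_query_result.json", "w") as f:
    json.dump(news, f, indent=1)
with open("Apple_query_result_body.txt", "w") as f:
    f.write("\n".join(item["body"] for item in news))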
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 20 17:52:11 2017
@author: red-sky
"""
import sys
import json
import numpy as np
def updateDict(words, dictUp):
# update word dictionary with given "words" and the dict "dictUp"
for w in words:
if w in dictUp:
dictUp[w] += 1
else:
dictUp[w] = 0
return dictUp
def extractVocab(eventsFile, fromIndex=0, toIndex="END"):
# from Events file, extract infor about words and create a mapping
vocab = dict()
with open(eventsFile, "r") as file:
list_events = file.read().strip().splitlines()
if toIndex == -1:
list_events = list_events[fromIndex:]
else:
list_events = sorted(set(list_events[fromIndex:toIndex]))
for i, event in enumerate(list_events):
if event[0] != "\t":
index = i
break
list_events = list_events[index:]
for event in list_events:
event = event.split("\t")
words = event[1].split(" ") + \
event[2].split(" ") + \
event[3].split(" ")
vocab = updateDict(words, vocab)
vocab_words = vocab.keys()
support_words = ["NOISEWORDS"]
vocab_words = support_words + \
sorted(vocab_words, key=lambda x: vocab[x], reverse=True)
IndexWords = range(len(vocab_words))
Count = ["NOISEWORDS"] + [vocab[w] for w in vocab_words[1:]]
result = [dict(zip(vocab_words, Count)),
dict(zip(IndexWords, vocab_words)),
dict(zip(vocab_words, IndexWords))]
return result, list_events
def convertEvent(eventsFile, vocabMapping, countMin=5):
# convert all Events to index for training
wordCount, _, word2index = vocabMapping
Events = []
with open(eventsFile, "r") as file:
list_events = file.read().strip().splitlines()
for event in list_events:
event = event.split("\t")
list_obj = [event[1].split(" "),
event[2].split(" "),
event[3].split(" ")]
# Covert only words that appear more than countMin
wordsIndexed = []
for obj in list_obj:
objIndex = []
for w in obj:
if wordCount[w] >= countMin:
objIndex.append(word2index[w])
else:
objIndex.append(0)
wordsIndexed.append(objIndex)
Events.append(wordsIndexed)
return Events
if __name__ == "__main__":
# in
EventPath = "../../Thesis_data/Apple_query_result_body.txt"
fromIndex = 0
toIndex = -1
minCountWord = 5
# out
EventNewPath = "./Events_for_training.txt"
VocabPath = "./Vocab_in_events_for_training.json"
IndexdEventPath = "./IndexedEvents_for_training.npy"
vocabMapping, EventNew = extractVocab(EventPath, fromIndex, toIndex)
with open(VocabPath, "w") as W:
json.dump(vocabMapping, W, indent=2)
with open(EventNewPath, "w") as W:
W.write("\n".join(EventNew))
indexed_events = convertEvent(EventNewPath, vocabMapping, minCountWord)
np.save(arr=np.array(indexed_events), file=IndexdEventPath)
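A minimal sketch of sanity-checking the artifacts written above; because the indexed events are ragged lists, the .npy file holds an object array and newer NumPy needs allow_pickle=True to read it back:

import json
import numpy as np

with open("./Vocab_in_events_for_training.json") as f:
    word_count, index2word, word2index = json.load(f)
events = np.load("./IndexedEvents_for_training.npy", allow_pickle=True)
print(len(word2index), "vocabulary entries,", len(events), "indexed events")
print(events[0])  # three word-index lists for one event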
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 20 11:58:54 2017
@author: red-sky
"""
import sys
import json
def findDate(news_body, list_news):
date = ""
for ind, new in enumerate(list_news):
if news_body in new["body"]:
date = new["time"]
break
return date
def extractAllDate(list_events, list_news, choosedInfor=[1, 2, 3, 0, 6]):
list_result = []
N = len(list_events)
i = 0.0
for event in list_events:
i += 1
if i % 1000 == 0:
print("Done %f percents" % (i/N*100))
date = [findDate(event[6], list_news)]
infor = date + [event[i] for i in choosedInfor]
list_result.append(infor)
return list_result
if __name__ == "__main__":
events = open(sys.argv[1], "r").read().strip().splitlines()
events = [event.split("\t") for event in events
if len(event.split("\t")) > 5]
news = json.load(open(sys.argv[2], "r"))
result = extractAllDate(events, news)
with open(sys.argv[3], "w") as W:
for line in result[1:]:
W.write("\t".join(line)+"\n")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 13 16:57:11 2017
@author: red-sky
"""
import sys
import numpy as np
import pickle
import pandas as pd
def main(VectorsPath, EventPath, StockPricePath, days):
with open(VectorsPath, "rb") as H:
Vec = pickle.load(H)
Vectors = np.array([list(b[0]) for a, b in Vec.values()])
# Vectors = np.load(VectorsPath)
with open(EventPath, "r") as H:
F = np.array([a.split("\t")[0:4] for a in H.read().splitlines()])
D = {}
for date, vec in zip(F[:, 0], Vectors):
if date[:10] in D:
D[date[:10]].append(vec)
else:
D[date[:10]] = [vec]
D2 = {}
for date in sorted(D.keys()):
D2[date] = np.mean(D[date], 0)
Dates = np.array(sorted(D2.keys()))
    SampleIndex = [list(range(i - days, i)) for i in range(days, len(Dates))]
DataX = []
DateX = []
for listIndex in SampleIndex:
DataX.append([D2[date] for date in Dates[listIndex]])
DateX.append(Dates[listIndex[-1]])
Df = pd.read_csv(StockPricePath)
LabelY = []
DataX_yesData = []
for i, date in enumerate(DateX):
retu = list(Df.loc[Df["Date"] == date]["ReturnOpen"])
print(retu)
        if len(retu) > 0:
            retu = float(retu[0]) * 100
            # label [1, 0] for a positive open-to-open return, [0, 1] otherwise
            if retu > 0:
                LabelY.append([1, 0])
            else:
                LabelY.append([0, 1])
            DataX_yesData.append(list(DataX[i]))
            print(date)
# else:
dataX = np.array(DataX_yesData)
dataY = np.array(LabelY)
print("DataX:", dataX.shape)
print("DataY:", dataY.shape, np.sum(dataY, 0) / np.sum(dataY))
return (dataX, dataY)
if __name__ == "__main__":
VectorsPath = sys.argv[1]
EventPath = sys.argv[2]
StockPricePath = sys.argv[3]
days = int(sys.argv[5])
DataX, LabelY = main(VectorsPath, EventPath, StockPricePath, days)
DataPath = sys.argv[4]
np.save(arr=DataX, file=DataPath+"/DailyVector" + sys.argv[5] + ".npy")
np.save(arr=LabelY, file=DataPath+"/DailyReturn" + sys.argv[5] + ".npy")
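A minimal sketch of a programmatic call matching the command-line interface above; the event-vector pickle name appears earlier in this commit, while the dated-events file (the tab-separated output of the date-matching script) and the price CSV, which needs 'Date' and 'ReturnOpen' columns, are illustrative paths:

DataX, LabelY = main(VectorsPath="./resultEmbeding.pickle",
                     EventPath="./Events_with_date.txt",
                     StockPricePath="./stock_prices.csv",
                     days=5)
np.save(arr=DataX, file="./DailyVector5.npy")
np.save(arr=LabelY, file="./DailyReturn5.npy")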