Showing 41 changed files with 984 additions and 0 deletions
소스코드/Bert_CNN_English.ipynb
0 → 100644
This diff could not be displayed because it is too large.
소스코드/Korean pre processing.ipynb
0 → 100644
This diff is collapsed.
소스코드/data/apple_combined_data.csv
0 → 100644
This diff could not be displayed because it is too large.
소스코드/data/apple_combined_data2015 .csv
0 → 100644
This diff could not be displayed because it is too large.
소스코드/event embedding/Data/EventNew.txt
0 → 100644
This diff could not be displayed because it is too large.
소스코드/event embedding/Data/IndexedEvents.npy
0 → 100644
No preview for this file type
1 | +{"class_name": "Sequential", "config": [{"class_name": "Flatten", "config": {"name": "flatten_1", "trainable": true, "batch_input_shape": [null, 5, 80], "dtype": "float32"}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 512, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.8}}, {"class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 1024, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_2", "trainable": true, "rate": 0.8}}, {"class_name": "Dropout", "config": {"name": "dropout_3", "trainable": true, "rate": 0.8}}, {"class_name": "Dense", "config": {"name": "dense_3", "trainable": true, "units": 2, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.0.2", "backend": "theano"} | ||
1 | +{"class_name": "Sequential", "config": [{"class_name": "Conv1D", "config": {"name": "conv1d_1", "trainable": true, "batch_input_shape": [null, 5, 80], "dtype": "float32", "filters": 128, "kernel_size": [1], "strides": [1], "padding": "valid", "dilation_rate": [1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Conv1D", "config": {"name": "conv1d_2", "trainable": true, "filters": 128, "kernel_size": [3], "strides": [1], "padding": "same", "dilation_rate": [1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "MaxPooling1D", "config": {"name": "max_pooling1d_1", "trainable": true, "strides": [2], "pool_size": [2], "padding": "valid"}}, {"class_name": "Flatten", "config": {"name": "flatten_1", "trainable": true}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.8}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 2, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.0.2", "backend": "theano"} | ||
소스코드/event embedding/Data/ModelStock_result/Proposed_Model/model2_price_move_predict.hdf5
0 → 100644
No preview for this file type
소스코드/event embedding/Data/ModelStock_result/Proposed_Model/model2_price_move_predict.json
0 → 100644
1 | +{"class_name": "Sequential", "config": [{"class_name": "LSTM", "config": {"name": "lstm_1", "trainable": true, "batch_input_shape": [null, 5, 80], "dtype": "float32", "return_sequences": false, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 256, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": {"class_name": "L1L2", "config": {"l1": 0.0, "l2": 0.0010000000474974513}}, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.6}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 2, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": {"class_name": "L1L2", "config": {"l1": 0.0, "l2": 0.0010000000474974513}}, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.0.2", "backend": "theano"} | ||
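The model .json files above store only the Keras architectures (note "keras_version": "2.0.2", "backend": "theano"); the sibling .hdf5 files hold the checkpointed models. A minimal reloading sketch, assuming a matching Keras 2.x install, is shown below; the toy input shape (n_samples, 5, 80) is taken from the batch_input_shape in the configs.

# Sketch only: reload the saved architecture and weights for inference.
import numpy as np
from keras.models import model_from_json

with open("model2_price_move_predict.json") as f:
    model = model_from_json(f.read())          # rebuild the architecture
# The .hdf5 was written by ModelCheckpoint (full model); depending on the Keras
# version, keras.models.load_model(..., custom_objects={...}) with the custom
# metric functions may be needed instead of load_weights().
model.load_weights("model2_price_move_predict.hdf5")

X = np.random.rand(10, 5, 80)                  # toy batch: 5 days x 80-dim event vectors
print(model.predict(X).argmax(axis=1))         # 0 = up, 1 = down (per the label encoding below)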
소스코드/event embedding/Data/Vocab.txt
0 → 100644
This diff could not be displayed because it is too large.
소스코드/event embedding/Data/data_stock.dat
0 → 100644
This diff is collapsed.
소스코드/event embedding/Data/data_stock_2.dat
0 → 100644
This diff is collapsed.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 20 23:41:51 2017

@author: red-sky
"""


import numpy as np
import theano
from theano import tensor as T


class EmbeddingLayer(object):
    def __init__(self, num_vocab, word_dim, rng, embedding_w=None):
        '''
        word_dim :: dimension of the word embeddings
        num_vocab :: number of word embeddings in the vocabulary
        embedding_w :: pre-trained word vectors (optional)
        '''

        if embedding_w is None:
            word_vectors = rng.uniform(-1.0, 1.0, (num_vocab, word_dim))
        else:
            word_vectors = embedding_w
        # Cast before wrapping in a shared variable: calling .astype() on the
        # shared variable itself returns a new symbolic variable, so the
        # embedding matrix would no longer be updatable through self.params.
        self.embedding_w = theano.shared(
            np.asarray(word_vectors, dtype=theano.config.floatX),
            name="EmbeddingLayer_W")

        self.params = [self.embedding_w]
        self.infor = [num_vocab, word_dim]

    def words_ind_2vec(self, index):
        # Look up the word vectors of the given indices and average them.
        map_word_vectors = self.embedding_w[index]
        output = T.mean(map_word_vectors, axis=0)
        return output, map_word_vectors


if __name__ == "__main__":
    rng = np.random.RandomState(220495)
    arrWords = T.ivector("words")
    EMBD = EmbeddingLayer(100, 150, rng=rng)
    mean_vec, word_vecs = EMBD.words_ind_2vec(arrWords)
    Word2Vec = theano.function(
        inputs=[arrWords],
        outputs=mean_vec
    )
    Vec = Word2Vec([1, 2, 3, 4])
    Vec = Word2Vec([2, 3, 4])
    print("Dim: ", Vec.shape)
    print("Val: ", Vec)
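words_ind_2vec simply averages the rows of the embedding matrix selected by the word indices; the toy NumPy check below (made-up values, not part of the repository) shows the equivalent computation:

import numpy as np

rng = np.random.RandomState(220495)
W = rng.uniform(-1.0, 1.0, (100, 150))   # same shape as EmbeddingLayer(100, 150)

# words_ind_2vec([2, 3, 4]) returns the mean of rows 2, 3 and 4 of W.
mean_vec = W[[2, 3, 4]].mean(axis=0)
print(mean_vec.shape)                     # (150,)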
This diff is collapsed.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 25 16:13:18 2017

@author: red-sky
"""

import theano
import numpy as np
import theano.tensor as T
from SmallUtils import createShareVar


class RoleDependentLayer(object):
    def __init__(self, left_dependent, right_dependent, rng,
                 n_in=100, n_out=4, trainedParams=None,
                 name="RoleDependentEmbedding_"):
        if trainedParams is None:
            trainedParams = {
                name: {
                    "T": None, "W1": None, "W2": None, "b": None
                }
            }

        if trainedParams[name]["T"] is not None:
            assert trainedParams[name]["T"].shape == (n_out, n_in, n_in)
            self.T = theano.shared(value=trainedParams[name]["T"],
                                   name=name+"T", borrow=True)
        else:
            self.T = createShareVar(rng=rng, name=name+"T",
                                    factor_for_init=n_out + n_in,
                                    dim=(n_out, n_in, n_in))

        if trainedParams[name]["W1"] is not None:
            assert trainedParams[name]["W1"].shape == (n_in, n_out)
            self.W1 = theano.shared(value=trainedParams[name]["W1"],
                                    name=name+"W1", borrow=True)
        else:
            self.W1 = createShareVar(rng=rng, name=name+"W1",
                                     factor_for_init=n_out + n_in,
                                     dim=(n_in, n_out))

        if trainedParams[name]["W2"] is not None:
            assert trainedParams[name]["W2"].shape == (n_in, n_out)
            self.W2 = theano.shared(value=trainedParams[name]["W2"],
                                    name=name+"W2", borrow=True)
        else:
            self.W2 = createShareVar(rng=rng, name=name+"W2",
                                     factor_for_init=n_out + n_in,
                                     dim=(n_in, n_out))

        if trainedParams[name]["b"] is not None:
            assert trainedParams[name]["b"].shape == (n_out,)
            self.b = theano.shared(value=trainedParams[name]["b"],
                                   name=name+"b", borrow=True)
        else:
            b_values = np.zeros(shape=(n_out,), dtype=theano.config.floatX)
            self.b = theano.shared(value=b_values, name=name+"b", borrow=True)

        # list of layer params
        self.params = [self.T, self.W1, self.W2, self.b]
        # keep the output dimension so that output_() can reuse it
        self.n_out = n_out

        # L2 regularization
        self.L2 = sum([(param**2).sum() for param in self.params])

        # Bilinear step: one n_in x n_in slice of the tensor per output unit
        def one_kernel(Tk, left, right):
            first_bilinear = theano.dot(left, Tk)
            second_bilinear = theano.dot(first_bilinear, right)
            return second_bilinear.flatten()

        bi_1, _ = theano.scan(
            fn=one_kernel,
            sequences=[self.T],
            non_sequences=[left_dependent, right_dependent],
            n_steps=n_out
        )

        # Feed-forward step
        feedforward_step1 = theano.dot(left_dependent, self.W1)
        feedforward_step2 = theano.dot(right_dependent, self.W2)
        feedforward_step3 = (feedforward_step1 +
                             feedforward_step2.dimshuffle("x", 0) +
                             self.b.dimshuffle("x", 0))
        feedforward_step4 = bi_1.dimshuffle(1, 0) + feedforward_step3
        self.output = theano.tensor.tanh(feedforward_step4)
        self.test = [feedforward_step3]

    def output_(self, left_dependent, right_dependent):
        # Same computation as in __init__, but for new symbolic inputs.
        def one_kernel(Tk, left, right):
            first_bilinear = theano.dot(left, Tk)
            second_bilinear = theano.dot(first_bilinear, right)
            return second_bilinear.flatten()

        bi_linear_tensor, _ = theano.scan(
            fn=one_kernel,
            sequences=[self.T],
            non_sequences=[left_dependent, right_dependent],
            n_steps=self.n_out  # n_out is not in scope here; use the stored value
        )

        bi_linear_tensor = bi_linear_tensor.dimshuffle(1, 0)
        feedforward_step1 = theano.dot(left_dependent, self.W1)
        feedforward_step2 = theano.dot(right_dependent, self.W2)
        feedforward_step3 = (feedforward_step1 +
                             feedforward_step2.dimshuffle("x", 0) +
                             self.b.dimshuffle("x", 0))
        feedforward_step4 = bi_linear_tensor + feedforward_step3
        output = theano.tensor.tanh(feedforward_step4)
        return output

    def get_params(self):
        trainedParams = {
            "T": self.T.get_value(), "W1": self.W1.get_value(),
            "W2": self.W2.get_value(), "b": self.b.get_value()
        }
        return trainedParams
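RoleDependentLayer implements the bilinear (neural-tensor) composition used for the event embedding: each of the n_out output units k combines its two inputs as left · T[k] · right, plus a standard feed-forward term. A NumPy sketch for a single pair of input vectors (shapes assumed, values made up):

import numpy as np

n_in, n_out = 100, 4
rng = np.random.RandomState(0)
left = rng.rand(n_in)              # e.g. actor embedding
right = rng.rand(n_in)             # e.g. action embedding
Tt = rng.rand(n_out, n_in, n_in)   # one n_in x n_in slice per output unit
W1 = rng.rand(n_in, n_out)
W2 = rng.rand(n_in, n_out)
b = np.zeros(n_out)

bilinear = np.array([left @ Tt[k] @ right for k in range(n_out)])
output = np.tanh(bilinear + left @ W1 + right @ W2 + b)   # shape (n_out,)
print(output.shape)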
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 25 15:55:14 2017

@author: red-sky
"""
import theano
import theano.tensor as T
import numpy as np


def createShareVar(rng, dim, name, factor_for_init):
    # Glorot-style uniform initialization, cast to the configured float type.
    var_values = np.asarray(
        rng.uniform(
            low=-np.sqrt(6.0 / factor_for_init),
            high=np.sqrt(6.0 / factor_for_init),
            size=dim,
        ),
        dtype=theano.config.floatX
    )
    Var = theano.shared(value=var_values, name=name, borrow=True)
    return Var


def adadelta(lr, tparams, cost, grads, listInput):
    """
    An adaptive learning rate optimizer

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams: list of Theano SharedVariables
        Model parameters
    grads: Theano variables
        Gradients of cost w.r.t. the parameters
    cost: Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.
    """
    np_float = np.asarray(0., dtype=theano.config.floatX)
    zipped_grads = [theano.shared(p.get_value() * np_float,
                                  name='%s_grad' % k)
                    for k, p in enumerate(tparams)]
    running_up2 = [theano.shared(p.get_value() * np_float,
                                 name='%s_rup2' % k)
                   for k, p in enumerate(tparams)]
    running_grads2 = [theano.shared(p.get_value() * np_float,
                                    name='%s_rgrad2' % k)
                      for k, p in enumerate(tparams)]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inputs=listInput,
                                    outputs=cost,
                                    updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads,
                                     running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams, updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update


def ADAM_OPTIMIZER(loss, all_params, learning_rate=0.001,
                   b1=0.9, b2=0.999, e=1e-8, gamma=1-1e-8):
    """
    CITE: http://sebastianruder.com/optimizing-gradient-descent/index.html#adam
    ADAM update rules
    Default values are taken from [Kingma2014]
    References:
        [Kingma2014] Kingma, Diederik, and Jimmy Ba.
        "Adam: A Method for Stochastic Optimization."
        arXiv preprint arXiv:1412.6980 (2014).
        http://arxiv.org/pdf/1412.6980v4.pdf
    """
    updates = []
    all_grads = theano.grad(loss, all_params)
    alpha = learning_rate
    t = theano.shared(np.float32(1))
    # (Decay the first moment running average coefficient)
    b1_t = b1*gamma**(t-1)

    for params_previous, g in zip(all_params, all_grads):
        init_moment = np.zeros(params_previous.get_value().shape,
                               dtype=theano.config.floatX)
        # (the mean)
        first_moment = theano.shared(init_moment)
        # (the uncentered variance)
        second_moment = theano.shared(init_moment)

        # (Update biased first moment estimate)
        bias_m = b1_t*first_moment + (1 - b1_t)*g

        # (Update biased second raw moment estimate)
        bias_v = b2*second_moment + (1 - b2)*g**2

        # (Compute bias-corrected first moment estimate)
        unbias_m = bias_m / (1-b1**t)

        # (Compute bias-corrected second raw moment estimate)
        unbias_v = bias_v / (1-b2**t)

        # (Update parameters)
        update_term = (alpha * unbias_m) / (T.sqrt(unbias_v) + e)
        params_new = params_previous - update_term

        updates.append((first_moment, bias_m))
        updates.append((second_moment, bias_v))
        updates.append((params_previous, params_new))

    # Increment the timestep once per call, not once per parameter:
    # Theano rejects duplicate updates to the same shared variable.
    updates.append((t, t + 1.))
    return updates
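A minimal sketch of how these helpers could be wired together (hypothetical toy model, not from the repository); the updates list returned by ADAM_OPTIMIZER is passed straight to theano.function, and the module name SmallUtils is taken from the import in RoleDependentLayer above:

import numpy as np
import theano
import theano.tensor as T
from SmallUtils import createShareVar, ADAM_OPTIMIZER

rng = np.random.RandomState(0)
x = T.matrix("x")
y = T.ivector("y")

# Toy softmax regression with a single weight matrix.
W = createShareVar(rng=rng, dim=(10, 2), name="W", factor_for_init=10 + 2)
p_y = T.nnet.softmax(T.dot(x, W))
loss = -T.mean(T.log(p_y)[T.arange(y.shape[0]), y])

updates = ADAM_OPTIMIZER(loss=loss, all_params=[W], learning_rate=0.001)
train_step = theano.function(inputs=[x, y], outputs=loss, updates=updates)

X_batch = np.random.rand(32, 10).astype(theano.config.floatX)
y_batch = np.random.randint(0, 2, size=32).astype("int32")
print(train_step(X_batch, y_batch))  # loss should decrease over repeated calls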
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 13 17:01:36 2017

@author: red-sky
"""

import sys
import numpy as np
np.random.seed(280295)
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import regularizers, optimizers


def recall(y_true, y_pred):
    """Recall metric.

    Only computes a batch-wise average of recall.

    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    Column 1 of the one-hot labels is treated as the positive class here.
    """
    true_positives = K.sum(K.round(K.clip(y_true[:, 1] * y_pred[:, 1], 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true[:, 1], 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.

    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true[:, 1] * y_pred[:, 1], 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred[:, 1], 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def fbeta_score(y_true, y_pred):

    # If there are no true positives, fix the F score at 0 like sklearn.
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0

    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = 1 ** 2
    fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
    return fbeta_score


def main(dataX_path, dataY_path, result_path,
         n_epoch, input_dim, days):

    # load data
    np.random.seed(2204)
    X = np.load(dataX_path)
    Y = np.load(dataY_path)

    # build Model
    model = Sequential()
    model.add(Conv1D(128, 1, activation='relu', input_shape=(days, input_dim)))
    model.add(Conv1D(128, 3, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Flatten())
    model.add(Dropout(0.8))
    model.add(Dense(2, activation='softmax'))
    adam = optimizers.Adam(lr=0.001)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy', recall, precision, fbeta_score])

    # callbacks and model architecture dump
    model_name = result_path+'model2_price_move_predict.hdf5'
    # mode="max" because a higher F-score is better; the default "auto" mode
    # would fall back to minimizing an unrecognized metric name (the LSTM
    # variant below already sets this explicitly).
    checkpointer = ModelCheckpoint(filepath=model_name,
                                   monitor='val_fbeta_score', mode="max",
                                   verbose=2, save_best_only=True)
    # Note: early stopping is created but not passed to fit() below.
    earlystopper = EarlyStopping(monitor='val_loss', patience=20, verbose=2)

    outmodel = open(result_path+'model2_price_move_predict.json', 'w')
    outmodel.write(model.to_json())
    outmodel.close()

    # process Training
    model.fit(X, Y, batch_size=32, verbose=2,
              validation_split=0.1, epochs=n_epoch,
              callbacks=[checkpointer])


if __name__ == "__main__":
    dataX = sys.argv[1]
    dataY = sys.argv[2]
    model_path = sys.argv[3]
    n_epoch = int(sys.argv[4])
    input_dim = int(sys.argv[5])
    days = int(sys.argv[6])
    main(dataX, dataY, model_path, n_epoch, input_dim, days)
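All three training scripts share the same positional-argument interface. A hypothetical call of main() with placeholder file names, matching the 5-day / 80-dimension setup of the JSON configs above:

# Placeholder file names; DailyVector5.npy / DailyReturn5.npy correspond to the
# outputs of the daily-vector preparation script further below.
main(dataX_path="DailyVector5.npy",      # (N, days, input_dim) event-vector windows
     dataY_path="DailyReturn5.npy",      # (N, 2) one-hot up/down labels
     result_path="./ModelStock_result/", # where the .hdf5 checkpoint and .json go
     n_epoch=200,                        # number of training epochs (assumed)
     input_dim=80,                       # event-embedding dimension
     days=5)                             # look-back window in trading days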
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 13 17:01:36 2017

@author: red-sky
"""

import sys
import numpy as np
np.random.seed(280295)
import keras.backend as K
from keras.models import Sequential
from keras import regularizers, optimizers
from keras.layers import Dense, Activation, LSTM, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping


def recall(y_true, y_pred):
    """Recall metric.

    Only computes a batch-wise average of recall.

    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    Column 0 of the one-hot labels is treated as the positive class here
    (the CNN variant above uses column 1 instead).
    """
    true_positives = K.sum(K.round(K.clip(y_true[:, 0] * y_pred[:, 0], 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true[:, 0], 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.

    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true[:, 0] * y_pred[:, 0], 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred[:, 0], 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def fbeta_score(y_true, y_pred):

    # If there are no true positives, fix the F score at 0 like sklearn.
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0

    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = 1 ** 2
    fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
    return fbeta_score


def main(dataX_path, dataY_path, result_path,
         n_epoch, input_dim, days):

    # load data
    np.random.seed(2204)
    X = np.load(dataX_path)
    Y = np.load(dataY_path)

    # build Model
    model = Sequential()
    model.add(LSTM(256, input_shape=(days, input_dim),
                   kernel_regularizer=regularizers.l2(0.001)))

    model.add(Dropout(0.6))
    model.add(Dense(2, activation='softmax',
                    kernel_regularizer=regularizers.l2(0.001)))
    adam = optimizers.Adam(lr=0.001)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy', recall, precision, fbeta_score])

    # callbacks and model architecture dump
    model_name = result_path+'model2_price_move_predict.hdf5'
    checkpointer = ModelCheckpoint(filepath=model_name,
                                   monitor='val_fbeta_score', mode="max",
                                   verbose=2, save_best_only=True)
    # Note: early stopping is created but not passed to fit() below.
    earlystopper = EarlyStopping(monitor='val_loss', patience=20, verbose=2)

    outmodel = open(result_path+'model2_price_move_predict.json', 'w')
    outmodel.write(model.to_json())
    outmodel.close()

    # process Training
    model.fit(X, Y, batch_size=32, verbose=2,
              validation_split=0.1, epochs=n_epoch,
              callbacks=[checkpointer])


if __name__ == "__main__":
    dataX = sys.argv[1]
    dataY = sys.argv[2]
    model_path = sys.argv[3]
    n_epoch = int(sys.argv[4])
    input_dim = int(sys.argv[5])
    days = int(sys.argv[6])
    main(dataX, dataY, model_path, n_epoch, input_dim, days)
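Since these metrics look at a single column of the one-hot output (column 0 here), a quick batch-wise sanity check with the Keras backend and the recall/precision functions defined above can be useful; the values below are made up:

import numpy as np
import keras.backend as K

# Toy batch of 4 samples, one-hot [class0, class1]; column 0 is the positive class.
y_true = K.variable(np.array([[1, 0], [1, 0], [0, 1], [0, 1]], dtype="float32"))
y_pred = K.variable(np.array([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4], [0.1, 0.9]], dtype="float32"))

# One of the two true positives is recovered, and one prediction is a false positive.
print(K.eval(recall(y_true, y_pred)))     # ~0.5
print(K.eval(precision(y_true, y_pred)))  # ~0.5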
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 13 17:01:36 2017

@author: red-sky
"""

import sys
import numpy as np
np.random.seed(280295)
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import regularizers, optimizers


def recall(y_true, y_pred):
    """Recall metric.

    Only computes a batch-wise average of recall.

    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true[:, 0] * y_pred[:, 0], 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true[:, 0], 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.

    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true[:, 0] * y_pred[:, 0], 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred[:, 0], 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def fbeta_score(y_true, y_pred):

    # If there are no true positives, fix the F score at 0 like sklearn.
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0

    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = 1 ** 2
    fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
    return fbeta_score


def main(dataX_path, dataY_path, result_path,
         n_epoch, input_dim, days):

    # load data
    np.random.seed(2204)
    X = np.load(dataX_path)
    Y = np.load(dataY_path)

    # build Model (plain MLP baseline; this matches the first JSON config above,
    # including the two consecutive Dropout layers left by the disabled Dense)
    model = Sequential()
    model.add(Flatten(input_shape=(days, input_dim)))
    model.add(Dense(512, activation='sigmoid'))
    model.add(Dropout(0.8))
    model.add(Dense(1024, activation='sigmoid'))
    model.add(Dropout(0.8))
# model.add(Dense(1024, activation='sigmoid'))
    model.add(Dropout(0.8))
    model.add(Dense(2, activation='softmax'))

    adam = optimizers.Adam(lr=0.001)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy', recall, precision, fbeta_score])

    # callbacks and model architecture dump
    model_name = result_path+'model2_price_move_predict.hdf5'
    checkpointer = ModelCheckpoint(filepath=model_name, monitor='val_acc',
                                   verbose=2, save_best_only=True)
    # Note: early stopping is created but not passed to fit() below.
    earlystopper = EarlyStopping(monitor='val_loss', patience=20, verbose=2)

    outmodel = open(result_path+'model2_price_move_predict.json', 'w')
    outmodel.write(model.to_json())
    outmodel.close()

    # process Training
    model.fit(X, Y, batch_size=32, verbose=2,
              validation_split=0.1, epochs=n_epoch,
              callbacks=[checkpointer])


if __name__ == "__main__":
    dataX = sys.argv[1]
    dataY = sys.argv[2]
    model_path = sys.argv[3]
    n_epoch = int(sys.argv[4])
    input_dim = int(sys.argv[5])
    days = int(sys.argv[6])
    main(dataX, dataY, model_path, n_epoch, input_dim, days)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 16 21:57:57 2017

@author: red-sky
"""

import bs4
import json
import sys
from bs4 import BeautifulSoup
import requests

BLOOMBERG_params = {
    "sort_by_newest": "time:desc",
    "sort_by_oldest": "time:asc",
    "source_from_bloomberg": "sites=bview",
    "end_time": "2017-03-12T15:20:16.240Z"
}

DATA_TO_EXTRACT = {
    "query_list_news": ["div", {"class": "search-result-story__container"}],
    "query_headline": ["h1", {"class": "search-result-story__headline"}],
    "query_time_published": ["time", {"class": "published-at"}],
    "query_body": ["div", {"class": "search-result-story__body"}]
}


def parser_url(query_string, page,
               sort_by="sort_by_oldest",
               source="source_from_bloomberg"):
    url = "https://www.bloomberg.com/"
    # add search query
    url = url + "search?query=" + query_string + "&"
    # add sort order
    url = url + "sort=" + BLOOMBERG_params[sort_by] + "&"
    # add news source filter
    url = url + "sites=" + BLOOMBERG_params[source] + "&"
    # add page number
    url = url + "page=" + str(page)
    return url


def get_rid_off_key(list_contents):
    # Join all body strings and drop the highlight markup around the query terms.
    body_string = ""
    for substring in list_contents:
        if (type(substring) == bs4.element.Tag):
            body_string += substring.string
        else:
            if (type(substring.string) == bs4.element.NavigableString):
                body_string += substring.string
    return body_string


def extract_from_url(url):
    try:
        response = requests.get(url)
        # requests exposes the payload via .content / .text, not .read()
        html_of_page = response.content
        soup_object = BeautifulSoup(html_of_page, "lxml")
        # Extract list of news in soup object
        param_to_find = DATA_TO_EXTRACT["query_list_news"]
        list_of_news = soup_object.find_all(param_to_find[0],
                                            attrs=param_to_find[1])
        if (len(list_of_news) == 0):
            return None
        # create list result extracted
        result = []
        for block_new in list_of_news:
            # extract time from block
            param_to_find = DATA_TO_EXTRACT["query_time_published"]
            time = block_new.find_all(param_to_find[0],
                                      attrs=param_to_find[1])
            time = time[0]["datetime"]

            # extract the headline
            param_to_find = DATA_TO_EXTRACT["query_headline"]
            headline = block_new.find_all(param_to_find[0],
                                          attrs=param_to_find[1])
            headline = get_rid_off_key(headline[0].a.contents)

            # extract the body as a list of strings
            param_to_find = DATA_TO_EXTRACT["query_body"]
            body = block_new.find_all(param_to_find[0],
                                      attrs=param_to_find[1])
            print(body)

            body_string = get_rid_off_key(body[0].contents)
            extracted_from_block = {"time": time,
                                    "headline": headline,
                                    "body": body_string}
            # for debug :
            # print("\t".join(extracted_from_block))
            if len(body_string) >= 5:
                result.append(extracted_from_block)
    except Exception as inst:
        print("Something went wrong :)", inst)
        print("URL: ", url)
        result = []
    return result


def Query(key, max_page=5000):
    # Init page and keep looping until the search returns no more results
    page = 1
    all_result_query = []
    error = 0
    while page < max_page:
        print("Collected: %d articles" % len(all_result_query))
        new_url = parser_url(key, page)
        result = extract_from_url(new_url)
        # None means the results page was empty -> stop crawling
        if result is None:
            break
        if len(result) > 0 or error > 10:
            page += 1
            error = 0
        else:
            error += 1
        all_result_query += result
    return all_result_query


if __name__ == "__main__":
    print("Begin query information about: ", sys.argv[1])
    print("Then will save result in: ", sys.argv[2])

    News = Query(sys.argv[1], int(sys.argv[4]))
    file_name1 = sys.argv[2]

    with open(file_name1, "w") as W:
        json.dump(News, W, indent=1)

    file_name2 = sys.argv[3]
    with open(file_name2, "w") as W:
        W.write("\n".join([new["body"] for new in News]))
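For reference, parser_url only concatenates the fixed Bloomberg search parameters; for example, a query for "apple" gives the URLs below (note the doubled "sites=" produced by joining the literal prefix with the stored parameter value):

print(parser_url("apple", page=1))
# -> https://www.bloomberg.com/search?query=apple&sort=time:asc&sites=sites=bview&page=1
print(parser_url("apple", page=3, sort_by="sort_by_newest"))
# -> https://www.bloomberg.com/search?query=apple&sort=time:desc&sites=sites=bview&page=3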
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 20 17:52:11 2017

@author: red-sky
"""
import sys
import json
import numpy as np


def updateDict(words, dictUp):
    # update the word-count dictionary "dictUp" with the given "words"
    for w in words:
        if w in dictUp:
            dictUp[w] += 1
        else:
            dictUp[w] = 1  # count the first occurrence as well
    return dictUp


def extractVocab(eventsFile, fromIndex=0, toIndex="END"):
    # from the events file, count the words and create index mappings
    vocab = dict()
    with open(eventsFile, "r") as file:
        list_events = file.read().strip().splitlines()
        if toIndex == -1:
            list_events = list_events[fromIndex:]
        else:
            list_events = sorted(set(list_events[fromIndex:toIndex]))
        # skip leading malformed lines whose first field is empty
        for i, event in enumerate(list_events):
            if event[0] != "\t":
                index = i
                break
        list_events = list_events[index:]
    for event in list_events:
        event = event.split("\t")
        words = event[1].split(" ") + \
            event[2].split(" ") + \
            event[3].split(" ")
        vocab = updateDict(words, vocab)
    vocab_words = vocab.keys()
    support_words = ["NOISEWORDS"]
    vocab_words = support_words + \
        sorted(vocab_words, key=lambda x: vocab[x], reverse=True)
    IndexWords = range(len(vocab_words))
    # index 0 is the "NOISEWORDS" placeholder used for rare words
    Count = [0] + [vocab[w] for w in vocab_words[1:]]
    result = [dict(zip(vocab_words, Count)),
              dict(zip(IndexWords, vocab_words)),
              dict(zip(vocab_words, IndexWords))]
    return result, list_events


def convertEvent(eventsFile, vocabMapping, countMin=5):
    # convert all events to word indices for training
    wordCount, _, word2index = vocabMapping
    Events = []
    with open(eventsFile, "r") as file:
        list_events = file.read().strip().splitlines()

    for event in list_events:
        event = event.split("\t")
        list_obj = [event[1].split(" "),
                    event[2].split(" "),
                    event[3].split(" ")]

        # Convert only words that appear at least countMin times;
        # everything else is mapped to index 0 ("NOISEWORDS")
        wordsIndexed = []
        for obj in list_obj:
            objIndex = []
            for w in obj:
                if wordCount[w] >= countMin:
                    objIndex.append(word2index[w])
                else:
                    objIndex.append(0)
            wordsIndexed.append(objIndex)
        Events.append(wordsIndexed)
    return Events


if __name__ == "__main__":
    # in
    EventPath = "../../Thesis_data/Apple_query_result_body.txt"
    fromIndex = 0
    toIndex = -1
    minCountWord = 5
    # out
    EventNewPath = "./Events_for_training.txt"
    VocabPath = "./Vocab_in_events_for_training.json"
    IndexdEventPath = "./IndexedEvents_for_training.npy"

    vocabMapping, EventNew = extractVocab(EventPath, fromIndex, toIndex)
    with open(VocabPath, "w") as W:
        json.dump(vocabMapping, W, indent=2)

    with open(EventNewPath, "w") as W:
        W.write("\n".join(EventNew))

    indexed_events = convertEvent(EventNewPath, vocabMapping, minCountWord)
    np.save(arr=np.array(indexed_events), file=IndexdEventPath)
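For clarity, extractVocab returns three mappings (word to count, index to word, word to index), with index 0 reserved for the "NOISEWORDS" placeholder, and convertEvent replaces words rarer than countMin by that placeholder. A hypothetical usage sketch (toy file name and field layout assumed):

# Toy events file, tab-separated; fields 1-3 are assumed to hold the
# actor / action / object phrases used by the functions above.
vocabMapping, kept_events = extractVocab("toy_events.txt", fromIndex=0, toIndex=-1)
wordCount, index2word, word2index = vocabMapping

print(index2word[0])            # "NOISEWORDS" -> placeholder for rare words
print(word2index["apple"])      # index of a frequent word (if present in the file)

indexed = convertEvent("toy_events.txt", vocabMapping, countMin=5)
print(indexed[0])               # [[actor word indices], [action ...], [object ...]]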
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 20 11:58:54 2017

@author: red-sky
"""

import sys
import json


def findDate(news_body, list_news):
    # return the publication time of the first news item whose body
    # contains the given event sentence
    date = ""
    for ind, new in enumerate(list_news):
        if news_body in new["body"]:
            date = new["time"]
            break
    return date


def extractAllDate(list_events, list_news, choosedInfor=[1, 2, 3, 0, 6]):
    # event[6] holds the sentence text that is matched against the news body;
    # choosedInfor lists the event fields kept in the output
    list_result = []
    N = len(list_events)
    i = 0.0
    for event in list_events:
        i += 1
        if i % 1000 == 0:
            print("Done %.1f percent" % (i/N*100))
        date = [findDate(event[6], list_news)]
        infor = date + [event[k] for k in choosedInfor]
        list_result.append(infor)
    return list_result


if __name__ == "__main__":
    events = open(sys.argv[1], "r").read().strip().splitlines()
    events = [event.split("\t") for event in events
              if len(event.split("\t")) > 5]
    news = json.load(open(sys.argv[2], "r"))
    result = extractAllDate(events, news)

    with open(sys.argv[3], "w") as W:
        for line in result[1:]:
            W.write("\t".join(line)+"\n")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 13 16:57:11 2017

@author: red-sky
"""
import sys
import numpy as np
import pickle
import pandas as pd


def main(VectorsPath, EventPath, StockPricePath, days):

    # load the learned event vectors and the dated events
    with open(VectorsPath, "rb") as H:
        Vec = pickle.load(H)
        Vectors = np.array([list(b[0]) for a, b in Vec.values()])
#    Vectors = np.load(VectorsPath)
    with open(EventPath, "r") as H:
        F = np.array([a.split("\t")[0:4] for a in H.read().splitlines()])

    # group the event vectors by calendar day (first 10 chars of the timestamp)
    D = {}
    for date, vec in zip(F[:, 0], Vectors):
        if date[:10] in D:
            D[date[:10]].append(vec)
        else:
            D[date[:10]] = [vec]

    # average all event vectors of the same day
    D2 = {}
    for date in sorted(D.keys()):
        D2[date] = np.mean(D[date], 0)

    Dates = np.array(sorted(D2.keys()))
    # sliding windows of `days` consecutive dates (the original hardcoded 5 here,
    # which only matches the intent when days == 5)
    SampleIndex = [list(range(i-days, i)) for i in range(days, len(Dates))]
    DataX = []
    DateX = []
    for listIndex in SampleIndex:
        DataX.append([D2[date] for date in Dates[listIndex]])
        DateX.append(Dates[listIndex[-1]])

    # label each window with the open-to-open return of the stock on that day
    Df = pd.read_csv(StockPricePath)
    LabelY = []
    DataX_yesData = []
    for i, date in enumerate(DateX):
        retu = list(Df.loc[Df["Date"] == date]["ReturnOpen"])
        print(retu)
        if len(retu) > 0:
            retu = float(retu[0])*100
            if retu > 0:
                LabelY.append([1, 0])   # up
            else:
                LabelY.append([0, 1])   # down or flat
            DataX_yesData.append(list(DataX[i]))
            print(date)

    dataX = np.array(DataX_yesData)
    dataY = np.array(LabelY)
    print("DataX:", dataX.shape)
    print("DataY:", dataY.shape, np.sum(dataY, 0) / np.sum(dataY))
    return (dataX, dataY)


if __name__ == "__main__":
    VectorsPath = sys.argv[1]
    EventPath = sys.argv[2]
    StockPricePath = sys.argv[3]
    days = int(sys.argv[5])
    DataX, LabelY = main(VectorsPath, EventPath, StockPricePath, days)
    DataPath = sys.argv[4]
    np.save(arr=DataX, file=DataPath+"/DailyVector" + sys.argv[5] + ".npy")
    np.save(arr=LabelY, file=DataPath+"/DailyReturn" + sys.argv[5] + ".npy")
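Taken together, these scripts form the data pipeline of the project: crawl Bloomberg news, attach dates to the extracted events, average their learned embeddings per trading day, and train the Keras classifiers. A hypothetical end-to-end driver is sketched below; every script and intermediate file name is a placeholder, since the actual file names are not visible in this diff, and the event-embedding training step that produces the event-vector pickle is among the collapsed files above.

# Sketch of the pipeline order; all script / file names below are placeholders.
import subprocess

steps = [
    # 1. Crawl Bloomberg search results (query, JSON out, raw-body out, max pages).
    ["python", "crawl_bloomberg.py", "apple", "apple_news.json",
     "apple_news_body.txt", "500"],
    # 2. Attach publication dates to extracted events (events TSV + news JSON).
    ["python", "find_event_dates.py", "events.txt", "apple_news.json",
     "dated_events.txt"],
    # 3. Average the learned event vectors per day and align them with returns
    #    (the event-vector pickle comes from the collapsed embedding-training code).
    ["python", "build_daily_vectors.py", "event_vectors.pkl", "dated_events.txt",
     "apple_combined_data.csv", "Data", "5"],
    # 4. Train a classifier on 5-day windows of 80-dim daily event vectors.
    ["python", "train_lstm.py", "Data/DailyVector5.npy", "Data/DailyReturn5.npy",
     "ModelStock_result/Proposed_Model/", "200", "80", "5"],
]
for cmd in steps:
    subprocess.run(cmd, check=True)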