최재은

Add: Event Embedding code & data, revised final report

Showing 41 changed files with 984 additions and 0 deletions
import numpy as np
print(np.load("./resultEmbeding.pickle", allow_pickle=True))
{"class_name": "Sequential", "config": [{"class_name": "Flatten", "config": {"name": "flatten_1", "trainable": true, "batch_input_shape": [null, 5, 80], "dtype": "float32"}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 512, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.8}}, {"class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 1024, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_2", "trainable": true, "rate": 0.8}}, {"class_name": "Dropout", "config": {"name": "dropout_3", "trainable": true, "rate": 0.8}}, {"class_name": "Dense", "config": {"name": "dense_3", "trainable": true, "units": 2, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.0.2", "backend": "theano"}
{"class_name": "Sequential", "config": [{"class_name": "Conv1D", "config": {"name": "conv1d_1", "trainable": true, "batch_input_shape": [null, 5, 80], "dtype": "float32", "filters": 128, "kernel_size": [1], "strides": [1], "padding": "valid", "dilation_rate": [1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Conv1D", "config": {"name": "conv1d_2", "trainable": true, "filters": 128, "kernel_size": [3], "strides": [1], "padding": "same", "dilation_rate": [1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "MaxPooling1D", "config": {"name": "max_pooling1d_1", "trainable": true, "strides": [2], "pool_size": [2], "padding": "valid"}}, {"class_name": "Flatten", "config": {"name": "flatten_1", "trainable": true}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.8}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 2, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.0.2", "backend": "theano"}
{"class_name": "Sequential", "config": [{"class_name": "LSTM", "config": {"name": "lstm_1", "trainable": true, "batch_input_shape": [null, 5, 80], "dtype": "float32", "return_sequences": false, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 256, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": {"class_name": "L1L2", "config": {"l1": 0.0, "l2": 0.0010000000474974513}}, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.6}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 2, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": {"class_name": "L1L2", "config": {"l1": 0.0, "l2": 0.0010000000474974513}}, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.0.2", "backend": "theano"}
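The three JSON files above are the serialized Keras 2.0 architectures (MLP, CNN, and LSTM classifiers over 5-day windows of 80-dimensional event vectors). A minimal sketch of restoring one of them for inspection, assuming the JSON and the HDF5 checkpoint written by the training scripts below sit in the working directory (the paths are illustrative):

from keras.models import model_from_json

with open("model2_price_move_predict.json") as f:
    model = model_from_json(f.read())
# weights come from the ModelCheckpoint file written during training
model.load_weights("model2_price_move_predict.hdf5")
model.summary()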
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 20 23:41:51 2017
@author: red-sky
"""
import numpy as np
import theano
from theano import tensor as T
class EmbeddingLayer(object):
def __init__(self, num_vocab, word_dim, rng, embedding_w=None):
'''
word_dim :: dimension of the word embeddings
num_vocab :: number of word embeddings in the vocabulary
embedding_w :: pre-train word vector
'''
        if embedding_w is None:
            embedding_w = rng.uniform(-1.0, 1.0, (num_vocab, word_dim))
        # Cast before wrapping: calling .astype on a shared variable would
        # return a plain symbolic tensor instead of a trainable shared variable.
        self.embedding_w = theano.shared(
            np.asarray(embedding_w, dtype=theano.config.floatX),
            name="EmbeddingLayer_W", borrow=True)
self.params = [self.embedding_w]
self.infor = [num_vocab, word_dim]
def words_ind_2vec(self, index):
map_word_vectors = self.embedding_w[index]
output = T.mean(map_word_vectors, axis=0)
return output, map_word_vectors
if __name__ == "__main__":
    rng = np.random.RandomState(220495)
    arrWords = T.ivector("words")
    EMBD = EmbeddingLayer(100, 150, rng=rng)
    # words_ind_2vec returns (mean vector, per-word vectors), so ask the
    # compiled function for both outputs explicitly.
    mean_vec, word_vecs = EMBD.words_ind_2vec(arrWords)
    Word2Vec = theano.function(inputs=[arrWords],
                               outputs=[mean_vec, word_vecs])
    Vec, word_vectors = Word2Vec([1, 2, 3, 4])
    Vec, word_vectors = Word2Vec([2, 3, 4])
    print("Dim: ", Vec.shape)
    print("Val: ", Vec)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 25 16:13:18 2017
@author: red-sky
"""
import theano
import numpy as np
import theano.tensor as T
from SmallUtils import createShareVar
class RoleDependentLayer(object):
def __init__(self, left_dependent, right_dependent, rng,
n_in=100, n_out=4, trainedParams=None,
name="RoleDependentEmbedding_"):
if trainedParams is None:
trainedParams = {
name: {
"T": None, "W1": None, "W2": None, "b": None
}
}
if trainedParams[name]["T"] is not None:
assert trainedParams[name]["T"].shape == (n_out, n_in, n_in)
self.T = theano.shared(value=trainedParams[name]["T"],
name=name+"T", borrow=True)
else:
self.T = createShareVar(rng=rng, name=name+"T",
factor_for_init=n_out + n_in,
dim=(n_out, n_in, n_in))
if trainedParams[name]["W1"] is not None:
assert trainedParams[name]["W1"].shape == (n_in, n_out)
self.W1 = theano.shared(value=trainedParams[name]["W1"],
name=name+"W1", borrow=True)
else:
self.W1 = createShareVar(rng=rng, name=name+"W1",
factor_for_init=n_out + n_in,
dim=(n_in, n_out))
if trainedParams[name]["W2"] is not None:
assert trainedParams[name]["W2"].shape == (n_in, n_out)
self.W2 = theano.shared(value=trainedParams[name]["W2"],
name=name+"W2", borrow=True)
else:
self.W2 = createShareVar(rng=rng, name=name+"W2",
factor_for_init=n_out + n_in,
dim=(n_in, n_out))
if trainedParams[name]["b"] is not None:
assert trainedParams[name]["b"].shape == (n_out,)
self.b = theano.shared(value=trainedParams[name]["b"],
name=name+"b", borrow=True)
else:
b_values = np.zeros(shape=(n_out,), dtype=theano.config.floatX)
self.b = theano.shared(value=b_values, name=name+"b", borrow=True)
        # list of layer parameters
        self.params = [self.T, self.W1, self.W2, self.b]
        self.n_out = n_out  # kept for the standalone output_() method below
        # L2 regularization term
        self.L2 = sum([(param ** 2).sum() for param in self.params])
# Bi-linear step
        def one_kernel(Tk, left, right):
            first_bi_linear = theano.dot(left, Tk)
            second_bi_linear = theano.dot(first_bi_linear, right)
            return second_bi_linear.flatten()
bi_1, _ = theano.scan(
fn=one_kernel,
sequences=[self.T],
non_sequences=[left_dependent, right_dependent],
n_steps=n_out
)
# Feed forward network step
feedforward_step1 = theano.dot(left_dependent, self.W1)
feedforward_step2 = theano.dot(right_dependent, self.W2)
feedforward_step3 = (feedforward_step1 +
feedforward_step2.dimshuffle("x", 0) +
self.b.dimshuffle("x", 0))
feedforward_step4 = bi_1.dimshuffle(1, 0) + feedforward_step3
self.output = theano.tensor.tanh(feedforward_step4)
self.test = [feedforward_step3]
    def output_(self, left_dependent, right_dependent):
        def one_kernel(Tk, left, right):
            first_bi_linear = theano.dot(left, Tk)
            second_bi_linear = theano.dot(first_bi_linear, right)
            return second_bi_linear.flatten()
        bi_linear_tensor, _ = theano.scan(
            fn=one_kernel,
            sequences=[self.T],
            non_sequences=[left_dependent, right_dependent],
            n_steps=self.n_out  # n_out is a constructor argument, not in scope here
        )
bi_linear_tensor = bi_linear_tensor.dimshuffle(1, 0)
feedforward_step1 = theano.dot(left_dependent, self.W1)
feedforward_step2 = theano.dot(right_dependent, self.W2)
feedforward_step3 = (feedforward_step1 +
feedforward_step2.dimshuffle("x", 0) +
self.b.dimshuffle("x", 0))
feedforward_step4 = bi_linear_tensor + feedforward_step3
output = theano.tensor.tanh(feedforward_step4)
return(output)
def get_params(self):
trainedParams = {
"T": self.T.get_value(), "W1": self.W1.get_value(),
"W2": self.W2.get_value(), "b": self.b.get_value()
}
return(trainedParams)
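A minimal sketch of composing two dependents with RoleDependentLayer, with the class above in scope; the 100-dimensional inputs, n_out=4, and the random vectors are illustrative rather than the project's actual training wiring:

import numpy as np
import theano
import theano.tensor as T

rng = np.random.RandomState(220495)
left = T.vector("left")     # e.g. averaged word vectors of the actor
right = T.vector("right")   # e.g. averaged word vectors of the action

layer = RoleDependentLayer(left_dependent=left, right_dependent=right,
                           rng=rng, n_in=100, n_out=4)
compose = theano.function([left, right], layer.output)

out = compose(np.random.rand(100).astype(theano.config.floatX),
              np.random.rand(100).astype(theano.config.floatX))
print(out.shape)  # (1, 4): one 4-dimensional role-dependent embedding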
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 25 15:55:14 2017
@author: red-sky
"""
import theano
import theano.tensor as T
import numpy as np
def createShareVar(rng, dim, name, factor_for_init):
    var_values = np.asarray(
        rng.uniform(
            low=-np.sqrt(6.0 / factor_for_init),
            high=np.sqrt(6.0 / factor_for_init),
            size=dim,
        ),
        # cast to floatX so the shared variable matches Theano's configured dtype
        dtype=theano.config.floatX,
    )
Var = theano.shared(value=var_values, name=name, borrow=True)
return Var
def adadelta(lr, tparams, cost, grads, listInput):
"""
An adaptive learning rate optimizer
Parameters
----------
lr : Theano SharedVariable
Initial learning rate
    tparams: Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of cost w.r.t. the parameters
    cost: Theano variable
        Objective function to minimize
Notes
-----
For more information, see [ADADELTA]_.
.. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
Rate Method*, arXiv:1212.5701.
"""
np_float = np.asarray(0., dtype=theano.config.floatX)
zipped_grads = [theano.shared(p.get_value() * np_float,
name='%s_grad' % k)
for k, p in enumerate(tparams)]
running_up2 = [theano.shared(p.get_value() * np_float,
name='%s_rup2' % k)
for k, p in enumerate(tparams)]
running_grads2 = [theano.shared(p.get_value() * np_float,
name='%s_rgrad2' % k)
for k, p in enumerate(tparams)]
zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
for rg2, g in zip(running_grads2, grads)]
f_grad_shared = theano.function(inputs=listInput,
outputs=cost,
updates=zgup + rg2up,
name='adadelta_f_grad_shared')
updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
for zg, ru2, rg2 in zip(zipped_grads,
running_up2,
running_grads2)]
ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
for ru2, ud in zip(running_up2, updir)]
param_up = [(p, p + ud) for p, ud in zip(tparams, updir)]
f_update = theano.function([lr], [], updates=ru2up + param_up,
on_unused_input='ignore',
name='adadelta_f_update')
return f_grad_shared, f_update
def ADAM_OPTIMIZER(loss, all_params, learning_rate=0.001,
b1=0.9, b2=0.999, e=1e-8, gamma=1-1e-8):
"""
CITE: http://sebastianruder.com/optimizing-gradient-descent/index.html#adam
ADAM update rules
Default values are taken from [Kingma2014]
References:
[Kingma2014] Kingma, Diederik, and Jimmy Ba.
"Adam: A Method for Stochastic Optimization."
arXiv preprint arXiv:1412.6980 (2014).
http://arxiv.org/pdf/1412.6980v4.pdf
"""
updates = []
all_grads = theano.grad(loss, all_params)
alpha = learning_rate
t = theano.shared(np.float32(1))
# (Decay the first moment running average coefficient)
b1_t = b1*gamma**(t-1)
for params_previous, g in zip(all_params, all_grads):
init_moment = np.zeros(params_previous.get_value().shape,
dtype=theano.config.floatX)
# (the mean)
first_moment = theano.shared(init_moment)
# (the uncentered variance)
second_moment = theano.shared(init_moment)
# (Update biased first moment estimate)
bias_m = b1_t*first_moment + (1 - b1_t)*g
# (Update biased second raw moment estimate)
bias_v = b2*second_moment + (1 - b2)*g**2
# (Compute bias-corrected first moment estimate)
unbias_m = bias_m / (1-b1**t)
# (Compute bias-corrected second raw moment estimate)
unbias_v = bias_v / (1-b2**t)
# (Update parameters)
update_term = (alpha * unbias_m) / (T.sqrt(unbias_v) + e)
params_new = params_previous - update_term
updates.append((first_moment, bias_m))
updates.append((second_moment, bias_v))
        updates.append((params_previous, params_new))
    # Advance the shared timestep once per call; appending it inside the loop
    # would register duplicate updates for t, which Theano rejects.
    updates.append((t, t + np.float32(1.0)))
    return updates
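A minimal sketch of using ADAM_OPTIMIZER on a toy objective, assuming Theano's default float64 configuration; the quadratic cost and the shared parameter W are illustrative only:

import numpy as np
import theano
import theano.tensor as T

W = theano.shared(np.ones(3, dtype=theano.config.floatX), name="W")
target = T.vector("target")
cost = T.sum((W - target) ** 2)

updates = ADAM_OPTIMIZER(loss=cost, all_params=[W], learning_rate=0.01)
train_step = theano.function([target], cost, updates=updates)

zeros = np.zeros(3, dtype=theano.config.floatX)
for _ in range(200):
    train_step(zeros)
print(W.get_value())  # should move toward the target vector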
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 13 17:01:36 2017
@author: red-sky
"""
import sys
import numpy as np
np.random.seed(280295)
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import regularizers, optimizers
def recall(y_true, y_pred):
"""Recall metric.
Only computes a batch-wise average of recall.
Computes the recall, a metric for multi-label classification of
how many relevant items are selected.
"""
true_positives = K.sum(K.round(K.clip(y_true[:, 1] * y_pred[:, 1], 0, 1)))
possible_positives = K.sum(K.round(K.clip(y_true[:, 1], 0, 1)))
recall = true_positives / (possible_positives + K.epsilon())
return recall
def precision(y_true, y_pred):
"""Precision metric.
Only computes a batch-wise average of precision.
Computes the precision, a metric for multi-label classification of
how many selected items are relevant.
"""
true_positives = K.sum(K.round(K.clip(y_true[:, 1] * y_pred[:, 1], 0, 1)))
predicted_positives = K.sum(K.round(K.clip(y_pred[:, 1], 0, 1)))
precision = true_positives / (predicted_positives + K.epsilon())
return precision
def fbeta_score(y_true, y_pred):
# If there are no true positives, fix the F score at 0 like sklearn.
if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
return 0
p = precision(y_true, y_pred)
r = recall(y_true, y_pred)
bb = 1 ** 2
fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
return fbeta_score
def main(dataX_path, dataY_path, result_path,
n_epoch, input_dim, days):
# load data
np.random.seed(2204)
X = np.load(dataX_path)
Y = np.load(dataY_path)
# build Model
model = Sequential()
model.add(Conv1D(128, 1, activation='relu', input_shape=(days, input_dim)))
model.add(Conv1D(128, 3, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Flatten())
model.add(Dropout(0.8))
model.add(Dense(2, activation='softmax'))
adam = optimizers.Adam(lr=0.001)
model.compile(loss='categorical_crossentropy',
optimizer=adam,
metrics=['accuracy', recall, precision, fbeta_score])
# model Compile
model_name = result_path+'model2_price_move_predict.hdf5'
    # mode="max": without it Keras treats an unknown metric name as one to minimize
    checkpointer = ModelCheckpoint(filepath=model_name,
                                   monitor='val_fbeta_score', mode="max",
                                   verbose=2, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=20, verbose=2)
outmodel = open(result_path+'model2_price_move_predict.json', 'w')
outmodel.write(model.to_json())
outmodel.close()
# process Training
model.fit(X, Y, batch_size=32, verbose=2,
validation_split=0.1, epochs=n_epoch,
callbacks=[checkpointer])
if __name__ == "__main__":
dataX = sys.argv[1]
dataY = sys.argv[2]
model_path = sys.argv[3]
n_epoch = int(sys.argv[4])
input_dim = int(sys.argv[5])
days = int(sys.argv[6])
main(dataX, dataY, model_path, n_epoch, input_dim, days)
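A minimal sketch of calling the entry point above directly instead of via sys.argv; the file locations and the epoch count are illustrative, while the 5-day window and 80-dimensional inputs match the DailyVector/DailyReturn arrays prepared later in this commit:

main(dataX_path="./DailyVector5.npy", dataY_path="./DailyReturn5.npy",
     result_path="./", n_epoch=100, input_dim=80, days=5)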
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 13 17:01:36 2017
@author: red-sky
"""
import sys
import numpy as np
np.random.seed(280295)
import keras.backend as K
from keras.models import Sequential
from keras import regularizers, optimizers
from keras.layers import Dense, Activation, LSTM, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
def recall(y_true, y_pred):
"""Recall metric.
Only computes a batch-wise average of recall.
Computes the recall, a metric for multi-label classification of
how many relevant items are selected.
"""
true_positives = K.sum(K.round(K.clip(y_true[:, 0] * y_pred[:, 0], 0, 1)))
possible_positives = K.sum(K.round(K.clip(y_true[:, 0], 0, 1)))
recall = true_positives / (possible_positives + K.epsilon())
return recall
def precision(y_true, y_pred):
"""Precision metric.
Only computes a batch-wise average of precision.
Computes the precision, a metric for multi-label classification of
how many selected items are relevant.
"""
true_positives = K.sum(K.round(K.clip(y_true[:, 0] * y_pred[:, 0], 0, 1)))
predicted_positives = K.sum(K.round(K.clip(y_pred[:, 0], 0, 1)))
precision = true_positives / (predicted_positives + K.epsilon())
return precision
def fbeta_score(y_true, y_pred):
# If there are no true positives, fix the F score at 0 like sklearn.
if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
return 0
p = precision(y_true, y_pred)
r = recall(y_true, y_pred)
bb = 1 ** 2
fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
return fbeta_score
def main(dataX_path, dataY_path, result_path,
n_epoch, input_dim, days):
# load data
np.random.seed(2204)
X = np.load(dataX_path)
Y = np.load(dataY_path)
# build Model
model = Sequential()
model.add(LSTM(256, input_shape=(days, input_dim),
kernel_regularizer=regularizers.l2(0.001)))
model.add(Dropout(0.6))
model.add(Dense(2, activation='softmax',
kernel_regularizer=regularizers.l2(0.001)))
adam = optimizers.Adam(lr=0.001)
model.compile(loss='categorical_crossentropy',
optimizer=adam,
metrics=['accuracy', recall, precision, fbeta_score])
# model Compile
model_name = result_path+'model2_price_move_predict.hdf5'
checkpointer = ModelCheckpoint(filepath=model_name,
monitor='val_fbeta_score', mode="max",
verbose=2, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=20, verbose=2)
outmodel = open(result_path+'model2_price_move_predict.json', 'w')
outmodel.write(model.to_json())
outmodel.close()
# process Training
model.fit(X, Y, batch_size=32, verbose=2,
validation_split=0.1, epochs=n_epoch,
callbacks=[checkpointer])
if __name__ == "__main__":
dataX = sys.argv[1]
dataY = sys.argv[2]
model_path = sys.argv[3]
n_epoch = int(sys.argv[4])
input_dim = int(sys.argv[5])
days = int(sys.argv[6])
main(dataX, dataY, model_path, n_epoch, input_dim, days)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 13 17:01:36 2017
@author: red-sky
"""
import sys
import numpy as np
np.random.seed(280295)
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import regularizers, optimizers
def recall(y_true, y_pred):
"""Recall metric.
Only computes a batch-wise average of recall.
Computes the recall, a metric for multi-label classification of
how many relevant items are selected.
"""
true_positives = K.sum(K.round(K.clip(y_true[:, 0] * y_pred[:, 0], 0, 1)))
possible_positives = K.sum(K.round(K.clip(y_true[:, 0], 0, 1)))
recall = true_positives / (possible_positives + K.epsilon())
return recall
def precision(y_true, y_pred):
"""Precision metric.
Only computes a batch-wise average of precision.
Computes the precision, a metric for multi-label classification of
how many selected items are relevant.
"""
true_positives = K.sum(K.round(K.clip(y_true[:, 0] * y_pred[:, 0], 0, 1)))
predicted_positives = K.sum(K.round(K.clip(y_pred[:, 0], 0, 1)))
precision = true_positives / (predicted_positives + K.epsilon())
return precision
def fbeta_score(y_true, y_pred):
# If there are no true positives, fix the F score at 0 like sklearn.
if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
return 0
p = precision(y_true, y_pred)
r = recall(y_true, y_pred)
bb = 1 ** 2
fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
return fbeta_score
def main(dataX_path, dataY_path, result_path,
n_epoch, input_dim, days):
# load data
np.random.seed(2204)
X = np.load(dataX_path)
Y = np.load(dataY_path)
# build Model
model = Sequential()
model.add(Flatten(input_shape=(days, input_dim)))
model.add(Dense(512, activation='sigmoid'))
model.add(Dropout(0.8))
model.add(Dense(1024, activation='sigmoid'))
model.add(Dropout(0.8))
# model.add(Dense(1024, activation='sigmoid'))
model.add(Dropout(0.8))
model.add(Dense(2, activation='softmax'))
adam = optimizers.Adam(lr=0.001)
model.compile(loss='categorical_crossentropy',
optimizer=adam,
metrics=['accuracy', recall, precision, fbeta_score])
# model Compile
model_name = result_path+'model2_price_move_predict.hdf5'
checkpointer = ModelCheckpoint(filepath=model_name, monitor='val_acc',
verbose=2, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=20, verbose=2)
outmodel = open(result_path+'model2_price_move_predict.json', 'w')
outmodel.write(model.to_json())
outmodel.close()
# process Training
model.fit(X, Y, batch_size=32, verbose=2,
validation_split=0.1, epochs=n_epoch,
callbacks=[checkpointer])
if __name__ == "__main__":
dataX = sys.argv[1]
dataY = sys.argv[2]
model_path = sys.argv[3]
n_epoch = int(sys.argv[4])
input_dim = int(sys.argv[5])
days = int(sys.argv[6])
main(dataX, dataY, model_path, n_epoch, input_dim, days)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 16 21:57:57 2017
@author: red-sky
"""
import bs4
import json
import sys
import urllib.request as urlreq
from bs4 import BeautifulSoup
import requests
BLOOMBERG_params = {
"sort_by_newest": "time:desc",
"sort_by_oldest": "time:asc",
"source_from_bloomberg": "sites=bview",
"end_time": "2017-03-12T15:20:16.240Z"
}
DATA_TO_EXTRACT = {
"query_list_news": ["div", {"class": "search-result-story__container"}],
"query_headline": ["h1", {"class": "search-result-story__headline"}],
"query_time_published": ["time", {"class": "published-at"}],
"query_body": ["div", {"class": "search-result-story__body"}]
}
def parser_url(query_string, page,
sort_by="sort_by_oldest",
source="source_from_bloomberg"):
url = "https://www.bloomberg.com/"
# add search query
url = url + "search?query=" + query_string + "&"
# add sort by
url = url + "sort=" + BLOOMBERG_params[sort_by] + "&"
# add time to query -- use present time
url = url + "sites=" + BLOOMBERG_params[source] + "&"
# add page number
url = url + "page=" + str(page)
return url
def get_rid_off_key(list_contents):
body_string = ""
for substring in list_contents:
if (type(substring) == bs4.element.Tag):
# join all body string and
# eliminate highlight query string key
body_string += substring.string
else:
if (type(substring.string) == bs4.element.NavigableString):
body_string += substring.string
return(body_string)
def extract_from_url(url):
    try:
        # requests responses expose .content / .text, not .read()
        response = requests.get(url)
        html_of_page = response.content
        soup_object = BeautifulSoup(html_of_page, "lxml")
# Extract list of news in soup object
param_to_find = DATA_TO_EXTRACT["query_list_news"]
list_of_news = soup_object.find_all(param_to_find[0],
attrs=param_to_find[1])
if (len(list_of_news) == 0):
return None
# create list result extracted
result = []
for block_new in list_of_news:
# extract time from block
param_to_find = DATA_TO_EXTRACT["query_time_published"]
time = block_new.find_all(param_to_find[0],
attrs=param_to_find[1])
time = time[0]["datetime"]
# extract new headline
param_to_find = DATA_TO_EXTRACT["query_headline"]
headline = block_new.find_all(param_to_find[0],
attrs=param_to_find[1])
headline = get_rid_off_key(headline[0].a.contents)
# extract new body list if string
param_to_find = DATA_TO_EXTRACT["query_body"]
body = block_new.find_all(param_to_find[0],
attrs=param_to_find[1])
print(body)
body_string = get_rid_off_key(body[0].contents)
extracted_from_block = {"time": time,
"headline": headline,
"body": body_string}
# for debug :
# print("\t".join(extracted_from_block))
if len(body_string) >= 5:
result.append(extracted_from_block)
    except Exception as inst:
        print("Something went wrong:", inst)
        print("URL: ", url)
result = []
return(result)
def Query(key, max_page=5000):
# Init page and looping until return None
page = 1
result = "not None"
all_result_query = []
error = 0
while True and page < max_page:
print("Colected: %d articles" % len(all_result_query))
new_url = parser_url(key, page)
result = extract_from_url(new_url)
        if result or error > 10:  # result may be None or empty when no news is returned
page += 1
error = 0
else:
error += 1
if result is not None:
all_result_query += result
else:
break
return(all_result_query)
if __name__ == "__main__":
print("Begin query information about: ", sys.argv[1])
print("Then will save result in: ", sys.argv[2])
News = Query(sys.argv[1], int(sys.argv[4]))
file_name1 = sys.argv[2]
with open(file_name1, "w") as W:
json.dump(News, W, indent=1)
file_name2 = sys.argv[3]
with open(file_name2, "w") as W:
W.write("\n".join([new["body"] for new in News]))
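A minimal sketch of the equivalent programmatic call to the crawler above; the query string, page limit, and output paths are illustrative:

news = Query("Apple", max_page=50)
with open("Apple_query_result.json", "w") as f:
    json.dump(news, f, indent=1)
with open("Apple_query_result_body.txt", "w") as f:
    f.write("\n".join(item["body"] for item in news))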
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 20 17:52:11 2017
@author: red-sky
"""
import sys
import json
import numpy as np
def updateDict(words, dictUp):
# update word dictionary with given "words" and the dict "dictUp"
for w in words:
if w in dictUp:
dictUp[w] += 1
else:
dictUp[w] = 0
return dictUp
def extractVocab(eventsFile, fromIndex=0, toIndex="END"):
# from Events file, extract infor about words and create a mapping
vocab = dict()
with open(eventsFile, "r") as file:
list_events = file.read().strip().splitlines()
if toIndex == -1:
list_events = list_events[fromIndex:]
else:
list_events = sorted(set(list_events[fromIndex:toIndex]))
for i, event in enumerate(list_events):
if event[0] != "\t":
index = i
break
list_events = list_events[index:]
for event in list_events:
event = event.split("\t")
words = event[1].split(" ") + \
event[2].split(" ") + \
event[3].split(" ")
vocab = updateDict(words, vocab)
vocab_words = vocab.keys()
support_words = ["NOISEWORDS"]
vocab_words = support_words + \
sorted(vocab_words, key=lambda x: vocab[x], reverse=True)
IndexWords = range(len(vocab_words))
Count = ["NOISEWORDS"] + [vocab[w] for w in vocab_words[1:]]
result = [dict(zip(vocab_words, Count)),
dict(zip(IndexWords, vocab_words)),
dict(zip(vocab_words, IndexWords))]
return result, list_events
def convertEvent(eventsFile, vocabMapping, countMin=5):
# convert all Events to index for training
wordCount, _, word2index = vocabMapping
Events = []
with open(eventsFile, "r") as file:
list_events = file.read().strip().splitlines()
for event in list_events:
event = event.split("\t")
list_obj = [event[1].split(" "),
event[2].split(" "),
event[3].split(" ")]
# Covert only words that appear more than countMin
wordsIndexed = []
for obj in list_obj:
objIndex = []
for w in obj:
if wordCount[w] >= countMin:
objIndex.append(word2index[w])
else:
objIndex.append(0)
wordsIndexed.append(objIndex)
Events.append(wordsIndexed)
return Events
if __name__ == "__main__":
# in
EventPath = "../../Thesis_data/Apple_query_result_body.txt"
fromIndex = 0
toIndex = -1
minCountWord = 5
# out
EventNewPath = "./Events_for_training.txt"
VocabPath = "./Vocab_in_events_for_training.json"
IndexdEventPath = "./IndexedEvents_for_training.npy"
vocabMapping, EventNew = extractVocab(EventPath, fromIndex, toIndex)
with open(VocabPath, "w") as W:
json.dump(vocabMapping, W, indent=2)
with open(EventNewPath, "w") as W:
W.write("\n".join(EventNew))
indexed_events = convertEvent(EventNewPath, vocabMapping, minCountWord)
np.save(arr=np.array(indexed_events), file=IndexdEventPath)
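A minimal sketch of sanity-checking the artifacts written above; because the indexed events are ragged lists, the .npy file holds an object array and newer NumPy needs allow_pickle=True to read it back:

import json
import numpy as np

with open("./Vocab_in_events_for_training.json") as f:
    word_count, index2word, word2index = json.load(f)
events = np.load("./IndexedEvents_for_training.npy", allow_pickle=True)
print(len(word2index), "vocabulary entries,", len(events), "indexed events")
print(events[0])  # three word-index lists for one event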
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 20 11:58:54 2017
@author: red-sky
"""
import sys
import json
def findDate(news_body, list_news):
date = ""
for ind, new in enumerate(list_news):
if news_body in new["body"]:
date = new["time"]
break
return date
def extractAllDate(list_events, list_news, choosedInfor=[1, 2, 3, 0, 6]):
list_result = []
N = len(list_events)
i = 0.0
for event in list_events:
i += 1
if i % 1000 == 0:
print("Done %f percents" % (i/N*100))
date = [findDate(event[6], list_news)]
infor = date + [event[i] for i in choosedInfor]
list_result.append(infor)
return list_result
if __name__ == "__main__":
events = open(sys.argv[1], "r").read().strip().splitlines()
events = [event.split("\t") for event in events
if len(event.split("\t")) > 5]
news = json.load(open(sys.argv[2], "r"))
result = extractAllDate(events, news)
with open(sys.argv[3], "w") as W:
for line in result[1:]:
W.write("\t".join(line)+"\n")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 13 16:57:11 2017
@author: red-sky
"""
import sys
import numpy as np
import pickle
import pandas as pd
def main(VectorsPath, EventPath, StockPricePath, days):
with open(VectorsPath, "rb") as H:
Vec = pickle.load(H)
Vectors = np.array([list(b[0]) for a, b in Vec.values()])
# Vectors = np.load(VectorsPath)
with open(EventPath, "r") as H:
F = np.array([a.split("\t")[0:4] for a in H.read().splitlines()])
D = {}
for date, vec in zip(F[:, 0], Vectors):
if date[:10] in D:
D[date[:10]].append(vec)
else:
D[date[:10]] = [vec]
D2 = {}
for date in sorted(D.keys()):
D2[date] = np.mean(D[date], 0)
Dates = np.array(sorted(D2.keys()))
    SampleIndex = [list(range(i - days, i)) for i in range(days, len(Dates))]
DataX = []
DateX = []
for listIndex in SampleIndex:
DataX.append([D2[date] for date in Dates[listIndex]])
DateX.append(Dates[listIndex[-1]])
Df = pd.read_csv(StockPricePath)
LabelY = []
DataX_yesData = []
for i, date in enumerate(DateX):
retu = list(Df.loc[Df["Date"] == date]["ReturnOpen"])
print(retu)
        if len(retu) > 0:
            retu = float(retu[0]) * 100
            # label [1, 0] for a positive open-to-open return, [0, 1] otherwise
            if retu > 0:
                LabelY.append([1, 0])
            else:
                LabelY.append([0, 1])
            DataX_yesData.append(list(DataX[i]))
            print(date)
# else:
dataX = np.array(DataX_yesData)
dataY = np.array(LabelY)
print("DataX:", dataX.shape)
print("DataY:", dataY.shape, np.sum(dataY, 0) / np.sum(dataY))
return (dataX, dataY)
if __name__ == "__main__":
VectorsPath = sys.argv[1]
EventPath = sys.argv[2]
StockPricePath = sys.argv[3]
days = int(sys.argv[5])
DataX, LabelY = main(VectorsPath, EventPath, StockPricePath, days)
DataPath = sys.argv[4]
np.save(arr=DataX, file=DataPath+"/DailyVector" + sys.argv[5] + ".npy")
np.save(arr=LabelY, file=DataPath+"/DailyReturn" + sys.argv[5] + ".npy")
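A minimal sketch of a programmatic call matching the command-line interface above; the event-vector pickle name appears earlier in this commit, while the dated-events file (the tab-separated output of the date-matching script) and the price CSV, which needs 'Date' and 'ReturnOpen' columns, are illustrative paths:

DataX, LabelY = main(VectorsPath="./resultEmbeding.pickle",
                     EventPath="./Events_with_date.txt",
                     StockPricePath="./stock_prices.csv",
                     days=5)
np.save(arr=DataX, file="./DailyVector5.npy")
np.save(arr=LabelY, file="./DailyReturn5.npy")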