# predict.py 1.48 KB
import config
from tensorflow.keras.models import load_model
from gensim.models import KeyedVectors
from file_parser import parse_keywords
import tensorflow as tf
from utils import *
import random
import numpy as np

def avg_feature_vector(text, model, num_features, index2word_set):
    """Average the embedding vectors of the known keywords found in ``text``.

    Keywords are produced by ``parse_keywords(text)``; only keywords that are
    members of ``index2word_set`` contribute to the average. Repeated
    keywords are counted once per occurrence.

    Args:
        text: Raw text handed to ``parse_keywords``.
        model: Embedding lookup; ``model[word]`` must yield a vector that can
            be added to an array of shape ``(num_features,)``.
        num_features: Length of the zero vector the running sum starts from.
        index2word_set: Container of the words that may be looked up in
            ``model``.

    Returns:
        The element-wise mean of the matched vectors, or the untouched
        all-zero vector (created as float32) when no keyword matched.
    """
    known_words = [w for w in parse_keywords(text) if w in index2word_set]

    running_sum = np.zeros((num_features,), dtype='float32')
    # Add the vectors one at a time, in keyword order.
    for known in known_words:
        running_sum = np.add(running_sum, model[known])

    # Nothing matched: skip the division and hand back the zero vector.
    if not known_words:
        return running_sum
    return np.divide(running_sum, len(known_words))

def compare(t2v_model, model, dir1, dir2, num_features=384):
    """Run ``model`` on averaged keyword embeddings of paired source files.

    Every ``.py`` file returned by ``readdir(dir1)`` is paired with the file
    at the same relative location under ``dir2``. Both files are read,
    reduced to one averaged embedding each via ``avg_feature_vector``, and
    all pairs are passed to ``model.predict`` in a single call.

    Args:
        t2v_model: Embedding lookup exposing ``index_to_key`` and
            ``t2v_model[word]`` (for example a gensim ``KeyedVectors``).
        model: Object exposing ``predict``.
        dir1: Directory whose ``.py`` files drive the comparison.
        dir2: Directory holding the counterpart of each file in ``dir1``.
        num_features: Length of each averaged embedding vector. Defaults to
            384, the value that was previously hard-coded.

    Returns:
        The value returned by ``model.predict``; it is also printed.
    """
    files = [f for f in readdir(dir1) if is_extension(f, 'py')]

    # NOTE(review): nothing is plotted in this function and ``plt`` is not
    # imported by name in this file (presumably it arrives through
    # ``from utils import *``). Kept to preserve behaviour; confirm whether
    # this call is still wanted.
    plt.ylabel('cos_sim')

    total = len(files)
    data = []
    index2word_set = set(t2v_model.index_to_key)

    for idx, f in enumerate(files):
        print(idx, "/", total)
        # Assumes ``readdir`` yields paths that contain ``dir1``. Split only
        # on the first occurrence so a path that repeats the ``dir1`` text
        # further down is not truncated.
        f2 = dir2 + f.split(dir1, 1)[1]

        text1 = readAll(f)
        text2 = readAll(f2)

        # Look the words up in the same model the vocabulary set was built
        # from. The previous code referenced an undefined ``c2v_model``.
        input1 = avg_feature_vector(text1, t2v_model, num_features, index2word_set)
        input2 = avg_feature_vector(text2, t2v_model, num_features, index2word_set)

        # NOTE(review): the nested-list layout is kept exactly as before;
        # confirm it matches the input structure ``model`` expects.
        data.append([[input1], [input2]])

    result = model.predict(data)
    print(result)
    return result

# The statements below run at import time: importing this module loads both
# the word vectors and the Keras model.

# Path of the word-vector file passed to the loader below.
vectors_text_path = 'data/targets.txt'
# binary=False: the file is parsed as text-format word2vec vectors.
t2v_model = KeyedVectors.load_word2vec_format(vectors_text_path, binary=False)
# Keras model loaded from the path configured in config.MODEL_PATH.
model = load_model(config.MODEL_PATH)

# Usage
# compare(t2v_model, model, 'data/refined', 'data/shuffled')