# text2vec.py (800 bytes)
from file_parser import parse_keywords
import numpy as np
from scipy import spatial

def avg_feature_vector(text, model, num_features, index2word_set):
    """Return the mean of the model vectors for the known keywords of ``text``.

    Keywords are whatever ``parse_keywords(text)`` yields. Only keywords that
    are members of ``index2word_set`` contribute; each occurrence counts, so a
    repeated keyword is weighted by how often it appears.

    Args:
        text: Input passed unchanged to ``parse_keywords``.
        model: Object indexable by keyword (``model[word]``) that returns the
            vector for that keyword.
        num_features: Length of the accumulator vector; presumably the
            dimensionality of the model's vectors (verify against caller).
        index2word_set: Container used for the ``word in ...`` vocabulary test.

    Returns:
        The element-wise average of the matched vectors, or a zero vector of
        shape ``(num_features,)`` and dtype ``float32`` when no keyword
        matched.
    """
    tokens = parse_keywords(text)
    total = np.zeros((num_features,), dtype='float32')
    matched = 0
    for token in tokens:
        # Skip anything the vocabulary does not know about.
        if token not in index2word_set:
            continue
        matched += 1
        total = np.add(total, model[token])
    if matched == 0:
        # Nothing to average: hand back the untouched zero accumulator.
        return total
    return np.divide(total, matched)

def get_similarity(text1, text2, model, num_features):
    """Return the absolute cosine similarity between two texts.

    Each text is reduced to a single vector by ``avg_feature_vector`` and the
    two vectors are compared with ``scipy.spatial.distance.cosine``.

    Args:
        text1: First text to compare.
        text2: Second text to compare.
        model: Embedding model exposing ``index_to_key`` (its vocabulary) and
            keyword lookup via ``model[word]``.
        num_features: Vector length forwarded to ``avg_feature_vector``.

    Returns:
        ``abs(1 - cosine_distance)`` of the two averaged vectors. If either
        averaged vector is all zeros (e.g. the text has no keyword in the
        model's vocabulary) the similarity is undefined and ``0.0`` is
        returned instead of NaN.
    """
    # Build the vocabulary set once so both texts share O(1) membership tests.
    index2word_set = set(model.index_to_key)
    s1 = avg_feature_vector(text1, model, num_features, index2word_set)
    s2 = avg_feature_vector(text2, model, num_features, index2word_set)
    # A zero vector has no direction: the cosine distance would divide by a
    # zero norm. Treat that case as "no similarity" rather than returning NaN.
    if not (np.any(s1) and np.any(s2)):
        return 0.0
    return abs(1 - spatial.distance.cosine(s1, s2))