# code2vec_tester.py

from gensim.models import KeyedVectors
import text2vec
import random
from utils import *
import matplotlib.pyplot as plt

# Path to the text-format (binary=False) word2vec vectors file that is loaded below.
vectors_text_path = 'data/targets.txt' # w2v output file from model
# Loaded once at import time; every compare* function in this file passes this
# object to text2vec.get_similarity.
model = KeyedVectors.load_word2vec_format(vectors_text_path, binary=False)

def compare(dir1, dir2):
    """Plot the similarity of each ``.py`` file under *dir1* to its twin under *dir2*.

    For every path returned by ``readdir(dir1)`` that has the ``py``
    extension, the part of the path following *dir1* is appended to *dir2*
    to locate the counterpart file. Both files are read and scored with
    ``text2vec.get_similarity``; each score is drawn as a red dot at the
    file's index, and the smallest and largest scores are printed before the
    figure is shown.

    Args:
        dir1: Directory that is listed for ``.py`` files.
        dir2: Directory holding the counterpart files under the same
            relative paths.

    Notes:
        * Prints ``min: None max: None`` when no files are found.
        * Assumes every path from ``readdir`` contains *dir1* (an
          ``IndexError`` is raised otherwise) -- TODO confirm against utils.
        * ``384`` is forwarded unchanged to ``text2vec.get_similarity``
          (presumably the vector size -- confirm against text2vec).
    """
    files = [f for f in readdir(dir1) if is_extension(f, 'py')]
    total = len(files)
    scores = []

    plt.ylabel('cos_sim')

    for idx, f in enumerate(files):
        print(idx, "/", total)
        # maxsplit=1: only strip the first occurrence of dir1, so a path that
        # happens to contain dir1 again further down keeps its full tail.
        f2 = dir2 + f.split(dir1, 1)[1]

        text1 = readAll(f)
        text2 = readAll(f2)

        similarity = text2vec.get_similarity(text1, text2, model, 384)
        scores.append(similarity)
        plt.plot(idx, similarity, 'r.')

    # Derive the range from the observed scores rather than from fixed
    # sentinels (0 / 10), which misreport the max when every score is negative.
    print("min:", min(scores, default=None), "max:", max(scores, default=None))
    plt.show()

def compare2(path): # for merged dataset
    """Plot how similar each merged file is to the two sources it came from.

    Reads ``<path>/log.txt``; each entry is expected to hold three
    whitespace-separated fields: ``path_merged path_source1 path_source2``.
    The merged path is taken relative to *path*; the two source paths are
    used as written. For every entry the merged file is scored against each
    source with ``text2vec.get_similarity``.

    The source-1 scores are plotted first (red dots); after a key or mouse
    press the axes are cleared and the source-2 scores are plotted (blue
    dots). The overall smallest and largest scores are printed before the
    first plot.

    Args:
        path: Directory containing ``log.txt`` and the merged files.

    Notes:
        * Blank log entries are skipped; an entry with fewer than three
          fields still raises ``IndexError``.
        * Prints ``min: None max: None`` when the log yields no scores.
        * ``384`` is forwarded unchanged to ``text2vec.get_similarity``
          (presumably the vector size -- confirm against text2vec).
    """
    pairs = read_file(path + '/log.txt') # log file format: path_merged path_source1 path_source2
    total = len(pairs)
    s1 = []
    s2 = []

    plt.ylabel('cos_sim')

    for idx, p in enumerate(pairs):
        print(idx, "/", total)
        # split() with no separator tolerates repeated spaces, tabs and the
        # trailing newline, and yields [] for a blank entry.
        arr = p.split()
        if not arr:
            continue
        C = path + '/' + arr[0]
        A = arr[1]
        B = arr[2]

        text_A = readAll(A)
        text_B = readAll(B)
        text_C = readAll(C)

        s1.append(text2vec.get_similarity(text_A, text_C, model, 384))
        s2.append(text2vec.get_similarity(text_B, text_C, model, 384))

    # Derive the range from the observed scores rather than from fixed
    # sentinels (0 / 10), which misreport the max when every score is negative.
    all_scores = s1 + s2
    print("min:", min(all_scores, default=None), "max:", max(all_scores, default=None))

    plt.plot(s1, 'r.')
    plt.waitforbuttonpress()

    plt.cla()
    # cla() resets the axes, including the y label set above, so restore it
    # for the second plot.
    plt.ylabel('cos_sim')
    plt.plot(s2, 'b.')
    plt.show()

def compare3(dir): # for original dataset compare
    """Plot the similarity of each ``.py`` file in *dir* to one random earlier file.

    All files returned by ``readdir(dir)`` with the ``py`` extension are read
    into memory first. Then every file except the first is scored, via
    ``text2vec.get_similarity``, against one file chosen uniformly at random
    from those that precede it in the listing. This makes one comparison per
    file, not an all-pairs comparison. Each score is drawn as a red dot at
    the file's index, and the smallest and largest scores are printed before
    the figure is shown.

    Args:
        dir: Directory that is listed for ``.py`` files. (The name shadows
            the builtin but is kept for caller compatibility.)

    Notes:
        * Prints ``min: None max: None`` when fewer than two files are found.
        * ``384`` is forwarded unchanged to ``text2vec.get_similarity``
          (presumably the vector size -- confirm against text2vec).
    """
    files = [f for f in readdir(dir) if is_extension(f, 'py')]
    total = len(files)

    plt.ylabel('cos_sim')

    data = []
    for idx, f in enumerate(files):
        print(idx, "/", total)
        data.append(readAll(f))

    scores = []
    for i in range(total):
        print(i)
        if i == 0:
            # The first file has no predecessor to compare against.
            continue
        # randrange(i) picks from 0..i-1 directly; it can never return i, so
        # no retry loop or temporary list is needed.
        j = random.randrange(i)

        similarity = text2vec.get_similarity(data[i], data[j], model, 384)
        scores.append(similarity)
        plt.plot(i, similarity, 'r.')

    # Derive the range from the observed scores rather than from fixed
    # sentinels (0 / 10), which misreport the max when every score is negative.
    print("min:", min(scores, default=None), "max:", max(scores, default=None))
    plt.show()

# Usage
# compare('data/refined', 'data/obfuscated2')
# compare2('data/merged')
# compare3('data/refined')