# data_obfuscator.py 3.87 KB
#
# Raw Blame History Permalink

from utils import *
import file_parser
import re

# obfuscator v1 uses names from other methods (shuffles method names)

def detect_vars(line): # detect identifier tokens and return their (start, end) ranges
    """Locate identifier-like tokens in a single line of source text.

    The line is scanned once, left to right. Text inside single- or
    double-quoted string literals is skipped in place (rather than stripped
    beforehand) so that the returned indices refer to the unmodified line.

    Keywords are NOT filtered out here: every token made of ASCII letters,
    digits and underscores that starts with a letter or underscore is
    reported. Callers are expected to keep only the names they care about.

    Args:
        line: one line of source code.

    Returns:
        A list of ``(start, end)`` tuples (``end`` exclusive) in order of
        appearance, such that ``line[start:end]`` is the detected token.
    """
    digits = '0123456789'
    ident_start = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
    ident_body = ident_start + digits

    ranges = []
    start = None       # index where the identifier being scanned began
    quote = None       # quote character of the string literal we are inside
    escaped = False    # previous character inside the string was a backslash
    in_number = False  # inside a numeric literal such as 0x1F or 1e5

    # The extra trailing space flushes an identifier that ends the line.
    for i, c in enumerate(line + ' '):
        if quote is not None:
            if escaped:
                # A backslash-escaped character never terminates the string.
                escaped = False
            elif c == '\\':
                escaped = True
            elif c == quote:
                quote = None
            continue

        if c == "'" or c == '"':
            # An identifier glued to an opening quote is a string prefix
            # (f, r, b, ...), not a variable, so it is discarded.
            start = None
            in_number = False
            quote = c
            continue

        if start is not None:
            if c not in ident_body:
                ranges.append((start, i))
                start = None
            continue

        if in_number:
            if c in ident_body:
                # Letters inside a numeric literal (x, e, j, ...) are not names.
                continue
            in_number = False

        if c in ident_start:
            start = i
        elif c in digits:
            in_number = True

    return ranges

def obfuscate(lines, vars, dictionary, mapper): # obfuscate the code
    """Replace every known variable name in ``lines`` with its mapped name.

    For each identifier range reported by ``detect_vars`` whose text is one of
    ``vars``, the text is replaced by ``vars[mapper[dictionary[name]]]``.
    Everything else (keywords, string literals, punctuation, whitespace) is
    copied through unchanged.

    Args:
        lines: iterable of source lines (strings).
        vars: indexable sequence of variable names eligible for replacement.
        dictionary: lookup from a variable name to a key accepted by ``mapper``
            (presumably the name's index in ``vars`` -- confirm against
            ``create_dictionary``).
        mapper: lookup from that key to an index into ``vars``.

    Returns:
        A new list with one obfuscated string per input line.
    """
    # Build the membership set once instead of scanning the list per token.
    known = set(vars)
    result = []

    for line in lines:
        pieces = []
        cursor = 0  # index of the first character not yet copied to the output

        for s, e in detect_vars(line):
            name = line[s:e]
            if name not in known:
                # Keyword or other non-variable token: copied later verbatim.
                continue
            pieces.append(line[cursor:s])
            pieces.append(vars[mapper[dictionary[name]]])
            cursor = e

        pieces.append(line[cursor:])
        result.append(''.join(pieces))

    return result

def create_var_histogram(input, outPath):
    """Write a keyword-frequency histogram for the ``.py`` files under ``input``.

    Every file returned by ``readdir(input)`` with a ``py`` extension is read,
    passed through ``remove_unnecessary_comments``, and fed line by line to
    ``file_parser.parse_keywords`` together with a shared dict (presumably
    updated in place with per-keyword counts -- confirm in ``file_parser``).
    The dict items are then sorted with ``select_value`` as key and written to
    ``outPath``, one ``str(item)`` per line, encoded as UTF-8.

    Args:
        input: directory to scan for Python sources.
        outPath: path of the histogram file to create or overwrite.
    """
    files = [f for f in readdir(input) if is_extension(f, 'py')]
    freq_dict = dict()

    for p in files:
        for line in remove_unnecessary_comments(read_file(p)):
            file_parser.parse_keywords(line, freq_dict)

    # `with` guarantees the file is closed even if a write raises.
    with open(outPath, 'w', encoding='utf8') as hist:
        for entry in sorted(freq_dict.items(), key=select_value):
            hist.write(str(entry) + '\n')

def read_histogram(inputPath):
    """Read back the names stored in a histogram file.

    Each line is split on the single-quote character and the text between the
    first pair of quotes is taken as the name (the histogram writers in this
    module emit ``str(item)`` per line, e.g. ``('name', 12)``).

    Lines that contain no single quote at all (blank or malformed lines) are
    skipped instead of raising ``IndexError``.

    Args:
        inputPath: path of the histogram file, read via ``read_file``.

    Returns:
        A list of the extracted names, in file order.
    """
    names = []

    for line in read_file(inputPath):
        parts = line.split("'")
        if len(parts) < 2:
            # No quoted token on this line; nothing to extract.
            continue
        names.append(parts[1])

    return names

def obfuscate_files(input, output, var=None, threshold=4000): # obfuscate variables. Guessing variable names from keyword frequency (threshold) if variable list is not given
    """Obfuscate every ``.py`` file under ``input`` and write it under ``output``.

    Each source file is read, stripped with ``remove_unnecessary_comments``
    and rewritten by ``obfuscate``. The directory layout below ``input`` is
    reproduced below ``output``.

    When ``var`` is not given, the variable list is guessed: keyword
    frequencies are collected with ``file_parser.parse_keywords``, dumped to
    ``<output>/log.txt`` (sorted with ``select_value``), and the first result
    of ``threshold_dict(freq_dict, threshold)`` supplies the names (the first
    element of each entry).

    Args:
        input: root directory containing the original sources.
        output: root directory for the obfuscated sources; created if missing.
        var: optional list of variable names to obfuscate.
        threshold: frequency threshold forwarded to ``threshold_dict``; only
            used when ``var`` is None.
    """
    # Local import: this module does not import `os` itself and would
    # otherwise depend on `from utils import *` re-exporting it.
    import os

    files = [f for f in readdir(input) if is_extension(f, 'py')]
    freq_dict = dict()
    codes = list()
    guess_vars = var is None

    for p in files:
        # IMPORTANT: remove comments from lines for preprocessing
        lines = remove_unnecessary_comments(read_file(p))
        codes.append((p, lines))

        if guess_vars:
            for line in lines:
                file_parser.parse_keywords(line, freq_dict)

    # Make sure the output root exists before anything is written into it.
    if output:
        os.makedirs(output, exist_ok=True)

    if guess_vars: # don't have variable list
        with open(os.path.join(output, 'log.txt'), 'w', encoding='utf8') as hist:
            for entry in sorted(freq_dict.items(), key=select_value):
                hist.write(str(entry) + '\n')

        var, _ = threshold_dict(freq_dict, threshold)
        var = [v[0] for v in var]

    dictionary = create_dictionary(var)
    mapper = create_mapper(len(var))

    for path, code in codes:
        obfuscated = obfuscate(code, var, dictionary, mapper)

        # relpath is separator-agnostic and does not care whether `input`
        # has a trailing separator or occurs more than once in the path.
        new_path = os.path.join(output, os.path.relpath(path, input))

        # Create the full parent directory chain of the target file.
        parent = os.path.dirname(new_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        write_file(new_path, obfuscated)

    print("Done generating Obfuscated Dataset")


'''
Usage
obfuscate_files('data/original', 'data/obfuscated')
'''