# data_obfuscator_v2.py (4.69 KB)
from utils import *
import file_parser
import re

# Obfuscator v2: generates random names for methods.

def random_character(start=False):
    """Return one random character usable in an identifier.

    start: when true, the character is drawn from '_', 'A'-'Z' and 'a'-'z'
           only (valid as the first character of a name); otherwise the
           digits '0'-'9' are included as well.
    """
    # Position in this string is exactly the value drawn below:
    # 0 -> '_', 1..26 -> 'A'..'Z', 27..52 -> 'a'..'z', 53..62 -> '0'..'9'.
    alphabet = '_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'

    # randint is inclusive on both ends; stopping at 52 excludes the digits.
    highest = 52 if start else 62
    return alphabet[random.randint(0, highest)]

    
def create_mapper_v2(L):
    """Generate L distinct random identifier names.

    Each name is 4 to 12 characters long. Its first character comes from
    random_character(True) and the remaining ones from random_character().

    L: number of names to generate.
    Returns a list of L unique strings, in generation order. Names that are
    Python reserved words are never returned.
    """
    # Local import so the block does not rely on the module's import list.
    import keyword

    ret = []
    seen = set()  # O(1) duplicate check instead of scanning the result list

    while len(ret) < L:
        length = random.randint(0, 8) + 4
        s = random_character(True)

        while len(s) < length:
            s += random_character()

        # A reserved word such as 'pass' or 'None' can be produced by chance
        # and would make the obfuscated source invalid, so draw again.
        if s in seen or keyword.iskeyword(s):
            continue

        seen.add(s)
        ret.append(s)

    return ret

def detect_vars(line):
    """Locate identifier-like words in a single line of source code.

    The line is scanned character by character. A word starts at an ASCII
    letter or underscore and continues over ASCII letters, digits and
    underscores. Text inside single- or double-quoted string literals is
    skipped. Keywords are NOT filtered out here; the caller does that.

    line: one line of source text.
    Returns a list of (start, end) index tuples such that line[start:end]
    is the detected word, in left-to-right order.

    Notes:
    - A backslash inside a string escapes the next character, so an escaped
      quote such as in 'it\\'s' does not terminate the string.
    - A word immediately followed by a quote is a string prefix (as in
      f"..." or r'...'), not a variable, and is not reported.
    - Strings spanning several lines cannot be recognised from one line.
    """
    ident_start = '_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
    ident_body = ident_start + '0123456789'

    ret = []
    start = 0
    in_word = False
    quote = None     # delimiter of the string being skipped, None outside
    escaped = False  # previous character inside the string was a backslash

    # Indices are reported against the original text (strings are skipped,
    # not removed). The trailing space closes a word that ends the line.
    for i, c in enumerate(line + ' '):
        if quote is not None:
            if escaped:
                escaped = False
            elif c == '\\':
                escaped = True
            elif c == quote:
                quote = None
            continue

        if c == "'" or c == '"':
            quote = c
            # Whatever word was pending is a string prefix; discard it.
            in_word = False
            continue

        if in_word:
            if c not in ident_body:
                in_word = False
                ret.append((start, i))
        elif c in ident_start:
            in_word = True
            start = i

    return ret

def obfuscate(lines, vars, dictionary, mapper):
    """Replace known variable names in every line with their obfuscated name.

    lines:      iterable of source lines.
    vars:       collection of names that should be replaced.
    dictionary: maps a name to the key used to look it up in `mapper`.
    mapper:     indexable by dictionary[name]; yields the replacement string.

    Returns a new list with one rewritten string per input line. Words found
    by detect_vars() that are not in `vars` (keywords, other names) and all
    text outside detected words are copied unchanged.
    """
    # Build the lookup set once; testing against the list for every word on
    # every line is a linear scan each time.
    known = set(vars)
    ret = []

    for line in lines:
        pieces = []
        cursor = 0  # index of the first character not yet copied

        for s, e in detect_vars(line):
            name = line[s:e]
            if name not in known:
                continue
            # Copy the untouched text before the word, then the replacement.
            pieces.append(line[cursor:s])
            pieces.append(mapper[dictionary[name]])
            cursor = e

        pieces.append(line[cursor:])
        ret.append(''.join(pieces))

    return ret

def create_var_histogram(input, outPath):
    """Write a keyword-frequency table for all .py files found under `input`.

    Every file returned by readdir(input) with the 'py' extension is read,
    passed through remove_unnecessary_comments(), and each resulting line is
    fed to file_parser.parse_keywords() together with a shared frequency
    dict. The dict items are then sorted with `select_value` as key
    (presumably by count - confirm in utils) and written to `outPath`, one
    str(item) per line, UTF-8 encoded.

    input:   directory to scan.
    outPath: path of the histogram file to create or overwrite.
    """
    files = [f for f in readdir(input) if is_extension(f, 'py')]
    freq_dict = dict()

    for p in files:
        lines = read_file(p)
        lines = remove_unnecessary_comments(lines)

        for line in lines:
            file_parser.parse_keywords(line, freq_dict)

    arr = sorted(freq_dict.items(), key=select_value)

    # The context manager closes the file even if a write fails.
    with open(outPath, 'w', encoding='utf8') as hist:
        for entry in arr:
            hist.write(str(entry) + '\n')
    
def read_histogram(inputPath):
    """Return the first single-quoted token of every line in a histogram file.

    Each line obtained from read_file(inputPath) is split on the single
    quote character and the piece between the first pair of quotes is kept
    (presumably the name part of a line such as "('name', 12)" - confirm
    against the writer of the file). A line without a single quote raises
    IndexError.
    """
    return [entry.split("'")[1] for entry in read_file(inputPath)]
    
def obfuscate_files(input, output, var=None, threshold=4000):
    """Obfuscate variable names in all .py files under `input` into `output`.

    Every file returned by readdir(input) with the 'py' extension is read and
    passed through remove_unnecessary_comments() before any other processing.
    The obfuscated copy is written below `output` at the same path the
    original has relative to `input`; missing directories are created.

    input:     directory containing the original sources.
    output:    directory receiving the obfuscated sources.
    var:       list of variable names to replace. When None, the names are
               guessed: keyword frequencies are collected with
               file_parser.parse_keywords(), dumped to <output>/log.txt, and
               threshold_dict(freq_dict, threshold) selects the entries whose
               first element is used as the name.
    threshold: value forwarded to threshold_dict(); only used when `var` is
               None (presumably a frequency cut-off - confirm in utils).

    Prints a completion message when finished; returns None.
    """
    files = [f for f in readdir(input) if is_extension(f, 'py')]
    freq_dict = dict()
    codes = list()
    guess_vars = var is None

    for p in files:
        lines = read_file(p)

        # IMPORTANT: comments are removed before any further processing.
        lines = remove_unnecessary_comments(lines)
        codes.append((p, lines))

        if guess_vars:
            for line in lines:
                file_parser.parse_keywords(line, freq_dict)

    if guess_vars:  # no variable list given: derive it from the frequencies
        # The log is written before any source file, so the output directory
        # may not exist yet. An empty `output` means the current directory.
        if output:
            os.makedirs(output, exist_ok=True)

        arr = sorted(freq_dict.items(), key=select_value)
        with open(os.path.join(output, 'log.txt'), 'w', encoding='utf8') as hist:
            for entry in arr:
                hist.write(str(entry) + '\n')

        var, _ = threshold_dict(freq_dict, threshold)
        var = [v[0] for v in var]

    dictionary = create_dictionary(var)
    mapper = create_mapper_v2(len(var))

    for path, code in codes:
        obfuscated = obfuscate(code, var, dictionary, mapper)

        # relpath copes with either path separator, a trailing separator on
        # `input`, and any nesting depth.
        new_path = os.path.join(output, os.path.relpath(path, input))

        # Create the directory that will hold the file, not a directory named
        # after the file itself.
        parent = os.path.dirname(new_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        write_file(new_path, obfuscated)

    print("Done generating Obfuscated Dataset")


'''
Usage
obfuscate_files('data/original', 'data/obfuscated')
'''