# file_parser.py
from utils import *
import re
import keyword

# Names collected from import lines by parse_keywords; shared across calls
# so that later lines can exclude them from the frequency count.
LIBRARYS = []

def parse_keywords(line, out):
    """Add the identifier frequencies of one source line to ``out``.

    The line is stripped and passed through ``remove_string`` (from utils;
    presumably removes string literals -- confirm there). Every character
    outside ``[A-Za-z_@0-9]`` is then treated as a separator.

    Tokens are handled as follows:
      * empty tokens, tokens accepted by ``is_number`` and tokens starting
        with a digit are ignored;
      * once ``from`` or ``import`` has been seen, every token (that keyword
        included) is recorded in the module-level ``LIBRARYS`` list instead
        of being counted, except a token that directly follows ``as``;
      * Python keywords, names already in ``LIBRARYS`` and tokens containing
        ``@`` are not counted;
      * anything else increments ``out[token]``.

    line -- a single line of source text
    out  -- output dictionary to sum up frequencies (modified in place)
    """
    cleaned = remove_string(line.strip())

    # Keep identifier characters, blank out everything else.
    sanitized = ''.join(
        ch if re.match('[A-Za-z_@0-9]', ch) else ' '
        for ch in cleaned
    )

    inside_import = False
    prev_key = ''

    for token in sanitized.split(' '):
        if not token or is_number(token) or token[0] in "0123456789":
            continue

        if token in ('from', 'import'):
            inside_import = True

        # Every path below records this token as the new "previous" one,
        # so remember the old value and update once here.
        preceding, prev_key = prev_key, token

        if inside_import and preceding != 'as':
            # Imported name (or the import keyword itself): remember it,
            # do not count it.
            if token not in LIBRARYS:
                LIBRARYS.append(token)
            continue

        if token in keyword.kwlist or token in LIBRARYS or '@' in token:
            continue

        out[token] = out.get(token, 0) + 1

def _count_indent(line):
    """Return the indent width of ``line``: a tab counts 4, a space counts 1.

    Scanning stops at the first character that is neither a tab nor a space,
    or at the end of the line, so empty and whitespace-only lines are safe.
    """
    width = 0
    for ch in line:
        if ch == '\t':
            width += 4
        elif ch == ' ':
            width += 1
        else:
            break
    return width


def parse_block(lines): # parse to import / def / class / normal (if, for, etc)
    """Build a block tree from source lines, nesting by indentation.

    Each line (after ``remove_unnecessary_comments``) becomes a block via
    ``create_block_from_line``. A block is attached to the closest preceding
    block with a strictly smaller indent; blocks with the same indent become
    siblings.

    Args:
        lines: sequence of source-line strings.

    Returns:
        The root ``Block`` (type ``'TYPE_ROOT'``) holding the whole tree.

    NOTE(review): this relies on the root block's ``indent`` being lower than
    any real line's indent (``Block`` lives in utils -- confirm).
    """
    lines = remove_unnecessary_comments(lines)
    root = Block('TYPE_ROOT') # main block tree node
    block_stack = [root]
    total = len(lines)
    i = 0

    # NOTE: parenthesis / multi-line string / line-continuation detection was
    # tried here and disabled because of too many exceptions; indentation
    # parsing alone handles most code.

    while i < total:
        line = lines[i]
        indent_count = _count_indent(line)

        block = create_block_from_line(line)
        block.setIndent(indent_count)

        # A decorator line ("@factory property" case) is merged with the
        # class/def line that follows it. Peek ahead only when a next line
        # exists; any other follower leaves the decorator as its own block.
        if block.blockType == 'TYPE_FACTORY' and i + 1 < total:
            follower = create_block_from_line(lines[i + 1])
            if follower.blockType in ('TYPE_CLASS', 'TYPE_DEF'):
                block.addLine(lines[i + 1])
                block.blockType = follower.blockType
                i += 1 # the follower line has been consumed

        # Close every open block that is not shallower than this line
        # (same indent -> sibling, deeper indent -> that block has ended).
        # The root is never popped, so there is always a parent to attach to.
        while len(block_stack) > 1 and indent_count <= block_stack[-1].indent:
            block_stack.pop()

        block_stack[-1].addBlock(block)
        block_stack.append(block)
        i += 1

    return root


"""
    Usage

    path = 'data/test.py'
    f = open(path, 'r')
    lines = f.readlines()
    f.close()

    
    block = parse_block(lines)
    block.debug()
    

    '''
    keywords = dict()
    for line in lines:
        parse_keywords(line, keywords)

    for k, v in keywords.items():
        print(k,':',v)

    a, b = threshold_dict(keywords, 3)

    print(a)
    print(b)
    '''
"""

'''
d = dict()
parse_keywords('from test.library import a as x, b as y', d)
print(d)
'''