file_parser.py
3.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from utils import *
import re
import keyword
# Names seen on import lines. parse_keywords() appends to this list and also
# consults it so that those names are not counted as ordinary identifiers.
# It is module-level state: entries accumulate across calls.
LIBRARYS = []
def parse_keywords(line, out):  # out : output dictionary to sum up frequencies
    """Count the identifier-like tokens of one source line into ``out``.

    The line is stripped and passed through ``remove_string`` (provided by
    ``utils``; presumably it drops string literals -- confirm there). Every
    character outside ``[A-Za-z_@0-9]`` then acts as a token separator.

    Tokens are skipped (not counted) when they are empty, numeric according
    to ``is_number``, or start with a digit; when they are Python keywords;
    when they are already registered in ``LIBRARYS``; or when they contain
    ``@``.

    Once ``from`` or ``import`` has been seen on the line, every token that
    does not directly follow ``as`` is registered in the module-level
    ``LIBRARYS`` list instead of being counted (this includes the ``from``,
    ``import`` and ``as`` tokens themselves). A token directly following
    ``as`` goes through the normal counting path.

    Args:
        line: a single line of source text.
        out: dictionary mapping token -> occurrence count, updated in place.
    """
    cleaned = remove_string(line.strip())
    # Replace every non-identifier character with a blank so that a plain
    # split on ' ' yields the candidate tokens (plus empty strings).
    tokens = re.sub('[^A-Za-z_@0-9]', ' ', cleaned).split(' ')

    in_import = False
    previous = ''
    for token in tokens:
        if not token or is_number(token) or token[0] in "0123456789":
            continue
        ## Exception code here if you want
        if token in ('from', 'import'):
            in_import = True
        # Decide with the *previous* token before overwriting it.
        names_library = in_import and previous != 'as'
        previous = token
        if names_library:
            if token not in LIBRARYS:
                LIBRARYS.append(token)
            continue
        if token in keyword.kwlist or token in LIBRARYS or '@' in token:
            continue
        ##
        out[token] = out.get(token, 0) + 1
def _leading_indent(line):
    """Return the indent width of ``line``: a leading tab counts 4, a space 1.

    Scanning stops at the first character that is neither a tab nor a space,
    or at the end of the line, so empty and whitespace-only lines are safe.
    """
    width = 0
    for ch in line:
        if ch == '\t':
            width += 4
        elif ch == ' ':
            width += 1
        else:
            break
    return width


def parse_block(lines):  # parse to import / def / class / normal (if, for, etc)
    """Build a tree of blocks from source lines, nested by indentation.

    ``lines`` is first filtered through ``remove_unnecessary_comments``. Each
    remaining line becomes a block via ``create_block_from_line`` and is
    attached to the closest preceding block with a smaller indent.

    A ``TYPE_FACTORY`` block (the ``@...`` decorator case) that is directly
    followed by a class or def line absorbs that line and takes over its
    type; otherwise the decorator line stays a block of its own.

    Args:
        lines: sequence of source lines (e.g. from ``file.readlines()``).

    Returns:
        The ``TYPE_ROOT`` block that owns the whole tree.
    """
    lines = remove_unnecessary_comments(lines)
    root = Block('TYPE_ROOT')  # main block tree node
    block_stack = [root]
    i = 0
    L = len(lines)
    # NOTE: explicit detection of multi-line strings / parenthesised
    # statements was tried and disabled because of too many exceptions;
    # continuation lines are handled only through their indentation.
    while i < L:
        line = lines[i]
        # Bounded scan: the previous open-ended index loop raised IndexError
        # on an empty or whitespace-only line without a trailing newline.
        indent_count = _leading_indent(line)
        block = create_block_from_line(line)
        block.setIndent(indent_count)

        # for @factory property exception
        # Only look ahead when a next line exists; a decorator on the very
        # last line used to raise IndexError here.
        if block.blockType == 'TYPE_FACTORY' and i + 1 < L:
            follower = create_block_from_line(lines[i + 1])
            if follower.blockType == 'TYPE_CLASS':
                i += 1
                block.addLine(lines[i])
                block.blockType = 'TYPE_CLASS'
            elif follower.blockType == 'TYPE_DEF':
                i += 1
                block.addLine(lines[i])
                block.blockType = 'TYPE_DEF'
            # else: unknown type (factory single lines, or multi line code);
            # the next line is processed on its own in the next iteration.

        # NOTE(review): this assumes the root block's indent is lower than
        # any real line's indent, so the root is never popped -- confirm in
        # Block's definition.
        top_indent = block_stack[-1].indent
        if indent_count == top_indent:
            # same indent -> sibling: replace the block on top of the stack
            block_stack.pop()
        elif indent_count < top_indent:
            # block ended: unwind to the nearest block with a smaller indent
            while indent_count <= block_stack[-1].indent:
                block_stack.pop()
        # (a larger indent means the block is nested in the previous block)
        block_stack[-1].addBlock(block)
        block_stack.append(block)
        i += 1
    return root
"""
Usage
path = 'data/test.py'
f = open(path, 'r')
lines = f.readlines()
f.close()
block = parse_block(lines)
block.debug()
'''
keywords = dict()
for line in lines:  # parse_keywords expects one line at a time
    parse_keywords(line, keywords)
for k, v in keywords.items():
print(k,':',v)
a, b = threshold_dict(keywords, 3)
print(a)
print(b)
'''
"""
'''
d = dict()
parse_keywords('from test.library import a as x, b as y', d)
print(d)
'''