data_merger.py
1.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from utils import *
import file_parser
import random
def merge_two_files(input, output):
    """Merge the ``.py`` files found under ``input`` pairwise into ``output``.

    The entries returned by ``readdir(input)`` that pass
    ``is_extension(f, 'py')`` are shuffled and consumed two at a time.  For
    each pair both files are read and parsed with ``file_parser.parse_block``,
    the sub-blocks of the second are added to the first, the combined block is
    passed to ``shuffle_block`` and written to ``<output>/<index>.py`` (index
    starts at 1).  When the number of files is odd, the leftover file is paired
    with a randomly chosen *different* file from the full list.

    A ``log.txt`` inside ``output`` gets one line per merged file in the form
    ``merge_file_name file1 file2``.

    Args:
        input: Path handed to ``readdir`` (presumably the directory holding the
            original files, e.g. ``'data/original'``).
        output: Directory receiving the merged files and ``log.txt``; it is
            created when missing.

    Raises:
        ValueError: If exactly one ``.py`` file is found, because there is no
            second file to merge it with (previously this case never returned).
    """
    # Imported locally so the function does not rely on ``os`` leaking in
    # through ``from utils import *``.
    import os

    ori_files = [f for f in readdir(input) if is_extension(f, 'py')]
    if len(ori_files) == 1:
        raise ValueError(
            "need at least two .py files to merge, found only {!r}".format(ori_files[0])
        )

    files = ori_files.copy()
    random.shuffle(files)
    os.makedirs(output, exist_ok=True)  # create the output directory if it does not exist

    # The context manager closes the log even when parsing/writing raises.
    with open(os.path.join(output, 'log.txt'), 'w', encoding='utf8') as log:
        index = 1
        while files:
            if len(files) == 1:
                # Odd file count: pair the leftover with any other original
                # file.  Choosing from a filtered list (instead of re-drawing
                # until the pick differs) is guaranteed to terminate.
                candidates = [f for f in ori_files if f != files[0]]
                pick = [files[0], random.choice(candidates)]
            else:
                pick = files[:2]
            files = files[2:]

            lines1 = read_file(pick[0])
            lines2 = read_file(pick[1])
            print("Merging:", pick[0], pick[1])

            block1 = file_parser.parse_block(lines1)
            block2 = file_parser.parse_block(lines2)
            for b in block2.blocks:
                block1.addBlock(b)
            shuffle_block(block1)

            write_block(os.path.join(output, '{}.py'.format(index)), block1)
            log.write('{}.py {} {}\n'.format(index, pick[0], pick[1]))
            index += 1

    print("Done generating Merged Dataset")
    print("log.txt generated in output path, for merged file info. [merge_file_name file1 file2]")
'''
Usage: merge_two_files('data/original', 'data/merged')
'''