# make_topic_dataset.py
import json
import pathlib

def extract_topic_rows(article: dict) -> list:
    """Return one topic record for every entry of ``article["document"]``.

    Each record keeps only the ``publisher``, ``topic`` and
    ``original_topic`` values of the entry's ``metadata`` mapping.

    Raises:
        KeyError: if ``document``, ``metadata`` or one of the three
            metadata fields is missing.
    """
    rows = []
    for doc in article["document"]:
        metadata = doc["metadata"]
        rows.append(
            {
                "publisher": metadata["publisher"],
                "topic": metadata["topic"],
                "original_topic": metadata["original_topic"],
            }
        )
    return rows


def build_topic_dataset(root, output_path,
                        pattern='Article/NWRW18*.json', progress_every=100):
    """Collect topic records from article JSON files and dump them as JSON.

    Args:
        root: ``pathlib.Path`` directory that ``pattern`` is globbed against.
        output_path: file the resulting list of records is written to
            (UTF-8, non-ASCII characters kept as-is).
        pattern: glob pattern, relative to ``root``, selecting the input files.
        progress_every: print the number of processed files each time it
            reaches a multiple of this value.

    Returns:
        The list of records that was written to ``output_path``.
    """
    rows = []
    # Directory enumeration order is not guaranteed, so sort the paths to
    # make the row order of the output reproducible across runs and machines.
    json_paths = sorted(root.glob(pattern))
    for count, json_path in enumerate(json_paths, start=1):
        # Keep the file open only for as long as parsing needs it.
        with json_path.open(encoding='utf-8') as json_file:
            article = json.load(json_file)
        rows.extend(extract_topic_rows(article))
        if count % progress_every == 0:
            print(count)
    with open(output_path, 'w', encoding='utf-8') as dataset:
        json.dump(rows, dataset, ensure_ascii=False)
    return rows


def main():
    """Build ``topic_to_predict.json`` from the articles under the current directory."""
    build_topic_dataset(pathlib.Path.cwd(), 'topic_to_predict.json')
    print("Finished")


if __name__ == "__main__":
    main()