# make_topic_dataset.py
# 763 Bytes
import json
import pathlib
# Glob pattern (relative to the base directory) selecting the article files.
ARTICLE_PATTERN = 'Article/NWRW18*.json'
# Default name of the JSON file the collected rows are written to.
OUTPUT_FILENAME = 'topic_to_predict.json'
# Print a progress counter every this many processed files.
PROGRESS_INTERVAL = 100


def extract_rows(article):
    """Return the topic fields of every document in one parsed article file.

    Args:
        article: Parsed JSON object with a "document" list; each entry must
            carry a "metadata" mapping holding "publisher", "topic" and
            "original_topic".

    Returns:
        A list with one dict per document containing exactly those three
        keys.

    Raises:
        KeyError: If "document", "metadata" or one of the three fields is
            missing.
    """
    rows = []
    for doc in article["document"]:
        metadata = doc["metadata"]
        rows.append({
            "publisher": metadata["publisher"],
            "topic": metadata["topic"],
            "original_topic": metadata["original_topic"],
        })
    return rows


def main(base_dir=None, output_path=OUTPUT_FILENAME):
    """Collect topic metadata from the article files into one JSON file.

    Reads every file matching ``Article/NWRW18*.json`` below *base_dir*,
    gathers one row per document and dumps the full list to *output_path*.
    A running file count is printed every 100 files, and "Finished" once
    the output has been written.

    Args:
        base_dir: Directory containing the ``Article`` folder. Defaults to
            the current working directory.
        output_path: Destination of the JSON dataset. Defaults to
            ``topic_to_predict.json`` (relative to the working directory).

    Returns:
        The list of row dicts that was written to *output_path*.

    Raises:
        KeyError: If an article file lacks one of the expected keys.
        json.JSONDecodeError: If an article file is not valid JSON.
    """
    base = pathlib.Path.cwd() if base_dir is None else pathlib.Path(base_dir)
    # Sort the matches so the row order in the output does not depend on
    # the order in which the file system happens to list the directory.
    article_paths = sorted(base.glob(ARTICLE_PATTERN))

    rows = []
    for count, json_path in enumerate(article_paths, start=1):
        with json_path.open(encoding='utf-8') as json_file:
            rows.extend(extract_rows(json.load(json_file)))
        if count % PROGRESS_INTERVAL == 0:
            print(count)

    # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
    with open(output_path, 'w', encoding='utf-8') as dataset:
        json.dump(rows, dataset, ensure_ascii=False)
    print("Finished")
    return rows


if __name__ == "__main__":
    main()