김민수

Upload preparation scripts

import json
import pathlib
def _to_record(doc):
    """Build the flat output record for one entry of an article's "document" list."""
    metadata = doc["metadata"]
    # Assumes the date is an 8-character "YYYYMMDD" string; it is re-joined
    # with hyphens by plain slicing, no validation is performed.
    raw_date = metadata["date"]
    return {
        "author": metadata["author"],
        "publisher": metadata["publisher"],
        "date": f"{raw_date[:4]}-{raw_date[4:6]}-{raw_date[6:]}",
        "topic": metadata["topic"],
        "paragraph": [line["form"] for line in doc["paragraph"]],
    }


def convert_articles_to_jsonl(article_dir=None, output_dir="jsonl") -> int:
    """Convert every ``*.json`` article file into a JSON Lines file.

    Each input file is expected to hold an object with an ``"id"`` and a
    ``"document"`` list; one line is written per document entry to
    ``<output_dir>/<id>.jsonl``.

    Args:
        article_dir: Directory holding the source ``*.json`` files. Defaults
            to ``Article`` under the current working directory.
        output_dir: Directory receiving the ``.jsonl`` files. It is created
            if missing. Defaults to ``jsonl`` (relative to the working
            directory).

    Returns:
        The number of input files converted (0 when none were found).
    """
    if article_dir is None:
        article_dir = pathlib.Path.cwd() / "Article"
    source_dir = pathlib.Path(article_dir)
    target_dir = pathlib.Path(output_dir)
    # Opening "jsonl/<id>.jsonl" fails when the directory does not exist yet.
    target_dir.mkdir(parents=True, exist_ok=True)

    # Pre-bind the counter so the summary below works even when the glob
    # matches nothing (the loop variable would otherwise be unbound).
    converted = 0
    for converted, json_path in enumerate(source_dir.glob("*.json"), start=1):
        with json_path.open(encoding="utf-8") as json_file:
            article = json.load(json_file)
        target_path = target_dir / f"{article['id']}.jsonl"
        with target_path.open("w", encoding="utf-8") as write_file:
            for doc in article["document"]:
                write_file.write(
                    json.dumps(_to_record(doc), ensure_ascii=False) + "\n"
                )
        if converted % 100 == 0:
            print(f"Converted {converted} files")
    print(f"Finished: total {converted} files are converted")
    return converted


if __name__ == "__main__":
    convert_articles_to_jsonl()
from pyarrow import json
import pyarrow.parquet as pq
import pathlib
from os import makedirs
def convert_jsonl_to_parquet(jsonl_dir=None, parquet_root="parquet") -> int:
    """Convert every ``*.jsonl`` file into a Parquet file.

    Each file is read with ``json.read_json`` (``json`` here is the
    ``pyarrow.json`` module imported at the top of this script, not the
    standard-library one) and written to
    ``<parquet_root>/<first 6 chars of stem>/<stem>.parquet``.

    Args:
        jsonl_dir: Directory holding the ``*.jsonl`` inputs. Defaults to
            ``jsonl`` under the current working directory.
        parquet_root: Root directory for the Parquet output. Defaults to
            ``parquet`` (relative to the working directory).

    Returns:
        The number of files converted (0 when none were found).
    """
    if jsonl_dir is None:
        jsonl_dir = pathlib.Path.cwd() / "jsonl"

    # Pre-bind the counter so the final summary does not raise NameError
    # when no input file matches.
    converted = 0
    for converted, json_path in enumerate(
        pathlib.Path(jsonl_dir).glob("*.jsonl"), start=1
    ):
        with json_path.open("rb") as json_file:
            table = json.read_json(json_file)
        # Files are grouped into one sub-directory per 6-character stem prefix.
        group_dir = f"{parquet_root}/{json_path.stem[:6]}"
        makedirs(group_dir, exist_ok=True)
        pq.write_table(table, f"{group_dir}/{json_path.stem}.parquet")
        if converted % 100 == 0:
            print(f"Converted {converted} files")
    print(f"Finished: total {converted} files are converted")
    return converted


if __name__ == "__main__":
    convert_jsonl_to_parquet()
import json
import pathlib
# Collect the publisher / topic / original_topic metadata of every document
# found in the Article/NWRW18*.json files and dump them as one JSON array.
rows = []
article_paths = pathlib.Path.cwd().glob('Article/NWRW18*.json')
for seen, article_path in enumerate(article_paths, start=1):
    with article_path.open(encoding='utf-8') as handle:
        loaded = json.load(handle)
    rows.extend(
        {
            "publisher": meta["publisher"],
            "topic": meta["topic"],
            "original_topic": meta["original_topic"],
        }
        for meta in (entry["metadata"] for entry in loaded["document"])
    )
    # Progress indicator: print the running file count every 100 files.
    if seen % 100 == 0:
        print(seen)

with open('topic_to_predict.json', 'w', encoding='utf-8') as sink:
    json.dump(rows, sink, ensure_ascii=False)
print("Finished")
\ No newline at end of file
import pyarrow.parquet as pq
import pandas as pd
import matplotlib.pyplot as plt
# Plot per-publisher, per-topic and per-month document counts for one
# category directory of Parquet files.
category = 'NIRW19'
table = pq.read_pandas(
    f"parquet/{category}",
    columns=['author', 'publisher', 'date', 'topic'],
)
print(table.schema)
df = table.to_pandas()

# 'author' is only the column being counted; each series holds the number of
# non-null author values per group.
per_publisher = df.groupby('publisher')['author'].count()
per_topic = df.groupby('topic')['author'].count()
# NOTE(review): grouping with freq='M' requires 'date' to be a datetime
# column - confirm it is loaded as such from the Parquet files.
per_month = df.groupby(pd.Grouper(key='date', freq='M'))['author'].count()

plt.rc('font', family='BATANG')


def _annotate_bars(axes, counts):
    """Write each count as a text label at the height of its bar."""
    for position, value in enumerate(counts):
        axes.text(position - 0.25, value, value)


# Bar chart: documents per publisher.
publisher_axes = per_publisher.plot(kind="bar")
plt.title(f"{category}-Publisher")
_annotate_bars(publisher_axes, per_publisher)
plt.show()

# Bar chart: documents per topic.
plt.title(f"{category}-Topic")
topic_axes = per_topic.plot(kind="bar")
_annotate_bars(topic_axes, per_topic)
plt.show()

# Line chart: documents per month.
plt.title(f"{category}-Month")
per_month.plot()
plt.show()

# Pie chart: publisher shares.
plt.title(f"{category}-Publisher")
per_publisher.plot.pie(autopct='%1.1f%%')
plt.show()

# Pie chart: topic shares.
plt.title(f"{category}-Topic")
per_topic.plot.pie(autopct='%1.1f%%')
plt.show()
\ No newline at end of file
This diff is collapsed. Click to expand it.