# jsonl2parquet.py
# 551 Bytes
from pyarrow import json
import pyarrow.parquet as pq
import pathlib
from os import makedirs
# Convert every file matching ./jsonl/*.jsonl into a Parquet file written to
# ./parquet/<first six characters of the stem>/<stem>.parquet.
# NOTE: `json` below is `pyarrow.json` (see the imports), not the stdlib module.
cwd = pathlib.Path.cwd()

# Initialised up front so the final summary is valid even when the glob
# matches nothing (the loop variable would otherwise never be bound).
converted = 0
for converted, json_path in enumerate(cwd.glob('jsonl/*.jsonl'), start=1):
    with json_path.open('rb') as json_file:
        table = json.read_json(json_file)

    # Outputs are grouped into sub-directories named after the first six
    # characters of the input file's stem.
    out_dir = cwd / "parquet" / json_path.stem[:6]
    makedirs(out_dir, exist_ok=True)
    pq.write_table(table, str(out_dir / f"{json_path.stem}.parquet"))

    # Progress report every 100 files.
    if converted % 100 == 0:
        print(f"Converted {converted} files")

# The loop never breaks, so a plain statement after it is equivalent to the
# original for/else clause.
print(f"Finished: total {converted} files are converted")