# jsonl2parquet.py 551 Bytes
from pyarrow import json
import pyarrow.parquet as pq
import pathlib
from os import makedirs

# Convert every file matching jsonl/*.jsonl under the current working
# directory into a Parquet file written to
# parquet/<first six characters of the file stem>/<file stem>.parquet
# (relative to the current working directory), printing a progress line
# every 100 files and a summary at the end.
cwd = pathlib.Path.cwd()

# Bound before the loop so the final summary is valid even when the glob
# matches no files (a loop variable alone would be unbound in that case).
converted = 0
for converted, json_path in enumerate(cwd.glob('jsonl/*.jsonl'), start=1):
    # NOTE: `json` is pyarrow.json (see the imports), not the stdlib module.
    with json_path.open('rb') as json_file:
        table = json.read_json(json_file)

    # The input handle is closed above; only the in-memory table is needed
    # from here on.
    out_dir = f"parquet/{json_path.stem[:6]}"
    makedirs(out_dir, exist_ok=True)
    pq.write_table(table, f"{out_dir}/{json_path.stem}.parquet")

    if converted % 100 == 0:
        print(f"Converted {converted} files")

# The loop never breaks, so a plain trailing statement is equivalent to the
# former for/else and does not suggest a search-style loop.
print(f"Finished: total {converted} files are converted")