dataClean.js 2.3 KB
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
var strip_bom_1 = __importDefault(require("strip-bom"));
/**
 * For each data chunk coming to parser:
 * 1. append the data to the buffer that is left from last chunk
 * 2. check if utf8 chars being split, if does, stripe the bytes and add to left buffer.
 * 3. stripBom
 */
function prepareData(chunk, runtime) {
    var workChunk = concatLeftChunk(chunk, runtime);
    runtime.csvLineBuffer = undefined;
    var cleanCSVString = cleanUtf8Split(workChunk, runtime).toString("utf8");
    if (runtime.started === false) {
        return strip_bom_1.default(cleanCSVString);
    }
    else {
        return cleanCSVString;
    }
}
exports.prepareData = prepareData;
/**
 *  append data to buffer that is left form last chunk
 */
function concatLeftChunk(chunk, runtime) {
    if (runtime.csvLineBuffer && runtime.csvLineBuffer.length > 0) {
        return Buffer.concat([runtime.csvLineBuffer, chunk]);
    }
    else {
        return chunk;
    }
}
/**
 * check if utf8 chars being split, if does, stripe the bytes and add to left buffer.
 */
function cleanUtf8Split(chunk, runtime) {
    var idx = chunk.length - 1;
    /**
     * From Keyang:
     * The code below is to check if a single utf8 char (which could be multiple bytes) being split.
     * If the char being split, the buffer from two chunk needs to be concat
     * check how utf8 being encoded to understand the code below.
     * If anyone has any better way to do this, please let me know.
     */
    if ((chunk[idx] & 1 << 7) != 0) {
        while ((chunk[idx] & 3 << 6) === 128) {
            idx--;
        }
        idx--;
    }
    if (idx != chunk.length - 1) {
        runtime.csvLineBuffer = chunk.slice(idx + 1);
        return chunk.slice(0, idx + 1);
        // var _cb=cb;
        // var self=this;
        // cb=function(){
        //   if (self._csvLineBuffer){
        //     self._csvLineBuffer=Buffer.concat([bufFromString(self._csvLineBuffer,"utf8"),left]);
        //   }else{
        //     self._csvLineBuffer=left;
        //   }
        //   _cb();
        // }
    }
    else {
        return chunk;
    }
}
//# sourceMappingURL=dataClean.js.map