dump.js 5.62 KB
/**
 *
 * Dump data to a big JSON tree
 *
 * @fileoverview dump data to JSON tree
 * @author Kevin Kwok <antimatter15@gmail.com>
 * @author Guillermo Webster <gui@mit.edu>
 * @author Jerome Wu <jeromewus@gmail.com>
 */

/**
 * deindent
 *
 * The generated HOCR is excessively indented, so
 * we get rid of that indentation
 *
 * @name deindent
 * @function deindent string
 * @access public
 */
const deindent = (html) => {
  const lines = html.split('\n');
  if (lines[0].substring(0, 2) === '  ') {
    for (let i = 0; i < lines.length; i += 1) {
      if (lines[i].substring(0, 2) === '  ') {
        lines[i] = lines[i].slice(2);
      }
    }
  }
  return lines.join('\n');
};

/**
 * dump
 *
 * @name dump
 * @function dump recognition result to a JSON object
 * @access public
 */
module.exports = (TessModule, api, {
  tessjs_create_hocr,
  tessjs_create_tsv,
  tessjs_create_box,
  tessjs_create_unlv,
  tessjs_create_osd,
}) => {
  const ri = api.GetIterator();
  const {
    RIL_BLOCK,
    RIL_PARA,
    RIL_TEXTLINE,
    RIL_WORD,
    RIL_SYMBOL,
  } = TessModule;
  const blocks = [];
  let block;
  let para;
  let textline;
  let word;
  let symbol;

  const enumToString = (value, prefix) => (
    Object.keys(TessModule)
      .filter((e) => (e.startsWith(`${prefix}_`) && TessModule[e] === value))
      .map((e) => e.slice(prefix.length + 1))[0]
  );

  ri.Begin();
  do {
    if (ri.IsAtBeginningOf(RIL_BLOCK)) {
      const poly = ri.BlockPolygon();
      let polygon = null;
      // BlockPolygon() returns null when automatic page segmentation is off
      if (TessModule.getPointer(poly) > 0) {
        const n = poly.get_n();
        const px = poly.get_x();
        const py = poly.get_y();
        polygon = [];
        for (let i = 0; i < n; i += 1) {
          polygon.push([px.getValue(i), py.getValue(i)]);
        }
        /*
         * TODO: find out why _ptaDestroy doesn't work
         */
        // TessModule._ptaDestroy(TessModule.getPointer(poly));
      }

      block = {
        paragraphs: [],
        text: ri.GetUTF8Text(RIL_BLOCK),
        confidence: ri.Confidence(RIL_BLOCK),
        baseline: ri.getBaseline(RIL_BLOCK),
        bbox: ri.getBoundingBox(RIL_BLOCK),
        blocktype: enumToString(ri.BlockType(), 'PT'),
        polygon,
      };
      blocks.push(block);
    }
    if (ri.IsAtBeginningOf(RIL_PARA)) {
      para = {
        lines: [],
        text: ri.GetUTF8Text(RIL_PARA),
        confidence: ri.Confidence(RIL_PARA),
        baseline: ri.getBaseline(RIL_PARA),
        bbox: ri.getBoundingBox(RIL_PARA),
        is_ltr: !!ri.ParagraphIsLtr(),
      };
      block.paragraphs.push(para);
    }
    if (ri.IsAtBeginningOf(RIL_TEXTLINE)) {
      textline = {
        words: [],
        text: ri.GetUTF8Text(RIL_TEXTLINE),
        confidence: ri.Confidence(RIL_TEXTLINE),
        baseline: ri.getBaseline(RIL_TEXTLINE),
        bbox: ri.getBoundingBox(RIL_TEXTLINE),
      };
      para.lines.push(textline);
    }
    if (ri.IsAtBeginningOf(RIL_WORD)) {
      const fontInfo = ri.getWordFontAttributes();
      const wordDir = ri.WordDirection();
      word = {
        symbols: [],
        choices: [],

        text: ri.GetUTF8Text(RIL_WORD),
        confidence: ri.Confidence(RIL_WORD),
        baseline: ri.getBaseline(RIL_WORD),
        bbox: ri.getBoundingBox(RIL_WORD),

        is_numeric: !!ri.WordIsNumeric(),
        in_dictionary: !!ri.WordIsFromDictionary(),
        direction: enumToString(wordDir, 'DIR'),
        language: ri.WordRecognitionLanguage(),

        is_bold: fontInfo.is_bold,
        is_italic: fontInfo.is_italic,
        is_underlined: fontInfo.is_underlined,
        is_monospace: fontInfo.is_monospace,
        is_serif: fontInfo.is_serif,
        is_smallcaps: fontInfo.is_smallcaps,
        font_size: fontInfo.pointsize,
        font_id: fontInfo.font_id,
        font_name: fontInfo.font_name,
      };
      const wc = new TessModule.WordChoiceIterator(ri);
      do {
        word.choices.push({
          text: wc.GetUTF8Text(),
          confidence: wc.Confidence(),
        });
      } while (wc.Next());
      TessModule.destroy(wc);
      textline.words.push(word);
    }

    // let image = null;
    // var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL)
    // var image = pix2array(pix);
    // // for some reason it seems that things stop working if you destroy pics
    // TessModule._pixDestroy(TessModule.getPointer(pix));
    if (ri.IsAtBeginningOf(RIL_SYMBOL)) {
      symbol = {
        choices: [],
        image: null,
        text: ri.GetUTF8Text(RIL_SYMBOL),
        confidence: ri.Confidence(RIL_SYMBOL),
        baseline: ri.getBaseline(RIL_SYMBOL),
        bbox: ri.getBoundingBox(RIL_SYMBOL),
        is_superscript: !!ri.SymbolIsSuperscript(),
        is_subscript: !!ri.SymbolIsSubscript(),
        is_dropcap: !!ri.SymbolIsDropcap(),
      };
      word.symbols.push(symbol);
      const ci = new TessModule.ChoiceIterator(ri);
      do {
        symbol.choices.push({
          text: ci.GetUTF8Text(),
          confidence: ci.Confidence(),
        });
      } while (ci.Next());
      // TessModule.destroy(i);
    }
  } while (ri.Next(RIL_SYMBOL));
  TessModule.destroy(ri);

  return {
    text: api.GetUTF8Text(),
    hocr: tessjs_create_hocr === '1' ? deindent(api.GetHOCRText()) : null,
    tsv: tessjs_create_tsv === '1' ? api.GetTSVText() : null,
    box: tessjs_create_box === '1' ? api.GetBoxText() : null,
    unlv: tessjs_create_unlv === '1' ? api.GetUNLVText() : null,
    osd: tessjs_create_osd === '1' ? api.GetOsdText() : null,
    confidence: api.MeanTextConf(),
    blocks,
    psm: enumToString(api.GetPageSegMode(), 'PSM'),
    oem: enumToString(api.oem(), 'OEM'),
    version: api.Version(),
  };
};