hebrewprober.js 14.2 KB

Raw Blame History Permalink

/*
 * The Original Code is Mozilla Universal charset detector code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 2001
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   António Afonso (antonio.afonso gmail.com) - port to JavaScript
 *   Mark Pilgrim - port to Python
 *   Shy Shalom - original C code
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301  USA
 */

// This prober doesn't actually recognize a language or a charset.
// It is a helper prober for the use of the Hebrew model probers

////// General ideas of the Hebrew charset recognition //////
//
// Four main charsets exist in Hebrew:
// "ISO-8859-8" - Visual Hebrew
// "windows-1255" - Logical Hebrew
// "ISO-8859-8-I" - Logical Hebrew
// "x-mac-hebrew" - ?? Logical Hebrew ??
//
// Both "ISO" charsets use a completely identical set of code points, whereas
// "windows-1255" and "x-mac-hebrew" are two different proper supersets of
// these code points. windows-1255 defines additional characters in the range
// 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
// diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6.
// x-mac-hebrew defines similar additional code points but with a different
// mapping.
//
// As far as an average Hebrew text with no diacritics is concerned, all four
// charsets are identical with respect to code points. Meaning that for the
// main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
// (including final letters).
//
// The dominant difference between these charsets is their directionality.
// "Visual" directionality means that the text is ordered as if the renderer is
// not aware of a BIDI rendering algorithm. The renderer sees the text and
// draws it from left to right. The text itself when ordered naturally is read
// backwards. A buffer of Visual Hebrew generally looks like so:
// "[last word of first line spelled backwards] [whole line ordered backwards
// and spelled backwards] [first word of first line spelled backwards]
// [end of line] [last word of second line] ... etc' "
// adding punctuation marks, numbers and English text to visual text is
// naturally also "visual" and from left to right.
//
// "Logical" directionality means the text is ordered "naturally" according to
// the order it is read. It is the responsibility of the renderer to display
// the text from right to left. A BIDI algorithm is used to place general
// punctuation marks, numbers and English text in the text.
//
// Texts in x-mac-hebrew are almost impossible to find on the Internet. From
// what little evidence I could find, it seems that its general directionality
// is Logical.
//
// To sum up all of the above, the Hebrew probing mechanism knows about two
// charsets:
// Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are
//    backwards while line order is natural. For charset recognition purposes
//    the line order is unimportant (In fact, for this implementation, even
//    word order is unimportant).
// Logical Hebrew - "windows-1255" - normal, naturally ordered text.
//
// "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
//    specifically identified.
// "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew
//    that contain special punctuation marks or diacritics is displayed with
//    some unconverted characters showing as question marks. This problem might
//    be corrected using another model prober for x-mac-hebrew. Due to the fact
//    that x-mac-hebrew texts are so rare, writing another model prober isn't
//    worth the effort and performance hit.
//
//////// The Prober ////////
//
// The prober is divided between two SBCharSetProbers and a HebrewProber,
// all of which are managed, created, fed data, inquired and deleted by the
// SBCSGroupProber. The two SBCharSetProbers identify that the text is in
// fact some kind of Hebrew, Logical or Visual. The final decision about which
// one is it is made by the HebrewProber by combining final-letter scores
// with the scores of the two SBCharSetProbers to produce a final answer.
//
// The SBCSGroupProber is responsible for stripping the original text of HTML
// tags, English characters, numbers, low-ASCII punctuation characters, spaces
// and new lines. It reduces any sequence of such characters to a single space.
// The buffer fed to each prober in the SBCS group prober is pure text in
// high-ASCII.
// The two SBCharSetProbers (model probers) share the same language model:
// Win1255Model.
// The first SBCharSetProber uses the model normally as any other
// SBCharSetProber does, to recognize windows-1255, upon which this model was
// built. The second SBCharSetProber is told to make the pair-of-letter
// lookup in the language model backwards. This in practice exactly simulates
// a visual Hebrew model using the windows-1255 logical Hebrew model.
//
// The HebrewProber is not using any language model. All it does is look for
// final-letter evidence suggesting the text is either logical Hebrew or visual
// Hebrew. Disjointed from the model probers, the results of the HebrewProber
// alone are meaningless. HebrewProber always returns 0.00 as confidence
// since it never identifies a charset by itself. Instead, the pointer to the
// HebrewProber is passed to the model probers as a helper "Name Prober".
// When the Group prober receives a positive identification from any prober,
// it asks for the name of the charset identified. If the prober queried is a
// Hebrew model prober, the model prober forwards the call to the
// HebrewProber to make the final decision. In the HebrewProber, the
// decision is made according to the final-letters scores maintained and Both
// model probers scores. The answer is returned in the form of the name of the
// charset identified, either "windows-1255" or "ISO-8859-8".

var CharSetProber = require('./charsetprober');
var constants = require('./constants')

// https://developer.mozilla.org/En/Core_JavaScript_1.5_Reference/Objects/Array/IndexOf
if (!Array.prototype.indexOf)
{
    Array.prototype.indexOf = function(elt /*, from*/)
    {
        var len = this.length >>> 0;

        var from = Number(arguments[1]) || 0;
        from = (from < 0)
             ? Math.ceil(from)
             : Math.floor(from);
        if (from < 0)
            from += len;

        for (; from < len; from++)
        {
            if (from in this &&
                this[from] === elt)
                return from;
        }
        return -1;
    };
}

function HebrewProber() {
    CharSetProber.apply(this);

    // windows-1255 / ISO-8859-8 code points of interest
    var FINAL_KAF = '\xea'
    var NORMAL_KAF = '\xeb'
    var FINAL_MEM = '\xed'
    var NORMAL_MEM = '\xee'
    var FINAL_NUN = '\xef'
    var NORMAL_NUN = '\xf0'
    var FINAL_PE = '\xf3'
    var NORMAL_PE = '\xf4'
    var FINAL_TSADI = '\xf5'
    var NORMAL_TSADI = '\xf6'

    // Minimum Visual vs Logical final letter score difference.
    // If the difference is below this, don't rely solely on the final letter score distance.
    var MIN_FINAL_CHAR_DISTANCE = 5

    // Minimum Visual vs Logical model score difference.
    // If the difference is below this, don't rely at all on the model score distance.
    var MIN_MODEL_DISTANCE = 0.01

    var VISUAL_HEBREW_NAME = "ISO-8859-8"
    var LOGICAL_HEBREW_NAME = "windows-1255"
    var self = this;

    function init() {
        self._mLogicalProber = null;
        self._mVisualProber = null;
        self.reset();
    }

    this.reset = function() {
        this._mFinalCharLogicalScore = 0;
        this._mFinalCharVisualScore = 0;
        // The two last characters seen in the previous buffer,
        // mPrev and mBeforePrev are initialized to space in order to simulate a word
        // delimiter at the beginning of the data
        this._mPrev = " ";
        this._mBeforePrev = " ";
        // These probers are owned by the group prober.
    }

    this.setModelProbers = function(logicalProber, visualProber) {
        this._mLogicalProber = logicalProber;
        this._mVisualProber = visualProber;
    }

    this.isFinal = function(c) {
        return [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI].indexOf(c) != -1;
    }

    this.isNonFinal = function(c) {
        // The normal Tsadi is not a good Non-Final letter due to words like
        // 'lechotet' (to chat) containing an apostrophe after the tsadi. This
        // apostrophe is converted to a space in FilterWithoutEnglishLetters causing
        // the Non-Final tsadi to appear at an end of a word even though this is not
        // the case in the original text.
        // The letters Pe and Kaf rarely display a related behavior of not being a
        // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
        // example legally end with a Non-Final Pe or Kaf. However, the benefit of
        // these letters as Non-Final letters outweighs the damage since these words
        // are quite rare.
        return [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE].indexOf(c) != -1;
    }

    this.feed = function(aBuf) {
        // Final letter analysis for logical-visual decision.
        // Look for evidence that the received buffer is either logical Hebrew or
        // visual Hebrew.
        // The following cases are checked:
        // 1) A word longer than 1 letter, ending with a final letter. This is an
        //    indication that the text is laid out "naturally" since the final letter
        //    really appears at the end. +1 for logical score.
        // 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
        //    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
        //    the Non-Final form of that letter. Exceptions to this rule are mentioned
        //    above in isNonFinal(). This is an indication that the text is laid out
        //    backwards. +1 for visual score
        // 3) A word longer than 1 letter, starting with a final letter. Final letters
        //    should not appear at the beginning of a word. This is an indication that
        //    the text is laid out backwards. +1 for visual score.
        //
        // The visual score and logical score are accumulated throughout the text and
        // are finally checked against each other in GetCharSetName().
        // No checking for final letters in the middle of words is done since that case
        // is not an indication for either Logical or Visual text.
        //
        // We automatically filter out all 7-bit characters (replace them with spaces)
        // so the word boundary detection works properly. [MAP]

        if( this.getState() == constants.notMe ) {
            // Both model probers say it's not them. No reason to continue.
            return constants.notMe;
        }

        aBuf = this.filterHighBitOnly(aBuf);

        for( var i = 0, cur; i < aBuf.length; i++ ) {
            cur = aBuf[i];
            if( cur == " " ) {
                // We stand on a space - a word just ended
                if( this._mBeforePrev != " " ) {
                    // next-to-last char was not a space so self._mPrev is not a 1 letter word
                    if( this.isFinal(this._mPrev) ) {
                        // case (1) [-2:not space][-1:final letter][cur:space]
                        this._mFinalCharLogicalScore++;
                    } else if( this.isNonFinal(this._mPrev) ) {
                        // case (2) [-2:not space][-1:Non-Final letter][cur:space]
                        this._mFinalCharVisualScore++;
                    }
                }
            } else {
                // Not standing on a space
                if( this._mBeforePrev == " " && this.isFinal(this._mPrev) && cur != " " ) {
                    // case (3) [-2:space][-1:final letter][cur:not space]
                    this._mFinalCharVisualScore++;
                }
            }
            this._mBeforePrev = this._mPrev;
            this._mPrev = cur;
        }
        // Forever detecting, till the end or until both model probers return eNotMe (handled above)
        return constants.detecting;
    }

    this.getCharsetName = function() {
        // Make the decision: is it Logical or Visual?
        // If the final letter score distance is dominant enough, rely on it.
        var finalsub = this._mFinalCharLogicalScore - this._mFinalCharVisualScore;
        if( finalsub >= MIN_FINAL_CHAR_DISTANCE ) {
            return LOGICAL_HEBREW_NAME;
        }
        if( finalsub <= -MIN_FINAL_CHAR_DISTANCE ) {
            return VISUAL_HEBREW_NAME;
        }

        // It's not dominant enough, try to rely on the model scores instead.
        var modelsub = this._mLogicalProber.getConfidence() - this._mVisualProber.getConfidence();
        if( modelsub > MIN_MODEL_DISTANCE ) {
            return LOGICAL_HEBREW_NAME;
        }
        if( modelsub < -MIN_MODEL_DISTANCE ) {
            return VISUAL_HEBREW_NAME;
        }

        // Still no good, back to final letter distance, maybe it'll save the day.
        if( finalsub < 0 ) {
            return VISUAL_HEBREW_NAME;
        }

        // (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
        return LOGICAL_HEBREW_NAME;
    }

    this.getState = function() {
        // Remain active as long as any of the model probers are active.
        if( this._mLogicalProber.getState() == constants.notMe &&
            this._mVisualProber.getState() == constants.notMe ) {
            return constants.notMe;
        }
        return constants.detecting;
    }

    init();
}
HebrewProber.prototype = new CharSetProber();

module.exports = HebrewProber