iso2022.js
3.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
var util = require('util'),
Match = require ('../match');
/**
 * This is a superclass for the individual detectors for
 * each of the detectable members of the ISO 2022 family
 * of encodings. Subclasses are expected to set
 * `this.escapeSequences` (an array of byte arrays, each starting with
 * ESC 0x1b), which `match` reads.
 */
function ISO_2022() {}

/**
 * Finds the first entry of `sequences` that occurs in `text` at offset `pos`.
 * The byte at `pos` itself is not compared: the caller has already checked
 * that it is ESC, so comparison starts at the second byte.
 *
 * @param text indexable sequence of byte values
 * @param textLen number of bytes of `text` that may be inspected
 * @param pos offset of the ESC byte being examined
 * @param sequences candidate escape sequences, tried in order
 * @return the matching sequence, or null when none fits
 */
function findEscapeSequence(text, textLen, pos, sequences) {
  for (var n = 0; n < sequences.length; n++) {
    var candidate = sequences[n];

    // Not enough input left to hold this sequence.
    if (textLen - pos < candidate.length)
      continue;

    var k = 1;
    while (k < candidate.length && candidate[k] === text[pos + k])
      k++;

    if (k >= candidate.length)
      return candidate;
  }
  return null;
}

/**
 * Matching function shared among the 2022 detectors JP, CN and KR.
 * Counts up the number of legal and unrecognized escape sequences in
 * the sample of text, and computes a score based on the total number &
 * the proportion that fit the encoding.
 *
 * @param det detector state; `det.fInputBytes` is the byte buffer to analyse
 *            and `det.fInputLen` is the number of bytes to scan
 * @return null when no known escape sequence was seen or the score is not
 *         positive; otherwise a new Match built from (det, this, quality),
 *         where quality is at most 100 and may be fractional
 */
ISO_2022.prototype.match = function(det) {
  var text = det.fInputBytes;
  var textLen = det.fInputLen;
  var hits = 0;
  var misses = 0;
  var shifts = 0;

  for (var i = 0; i < textLen; i++) {
    if (text[i] === 0x1b) {
      var seq = findEscapeSequence(text, textLen, i, this.escapeSequences);
      if (seq !== null) {
        hits++;
        // Land on the last byte of the sequence; the loop increment
        // then moves past it.
        i += seq.length - 1;
        continue;
      }
      misses++;
    }

    // Shift in/out
    if (text[i] === 0x0e || text[i] === 0x0f)
      shifts++;
  }

  if (hits === 0)
    return null;

  //
  // Initial quality is based on relative proportion of recognized vs.
  // unrecognized escape sequences.
  //   All good:           quality = 100;
  //   half or less good:  quality = 0;
  //   linear in between.
  var quality = (100 * hits - 100 * misses) / (hits + misses);

  // Back off quality if there were too few escape sequences seen.
  // Include shifts in this computation, so that KR does not get penalized
  // for having only a single Escape sequence, but many shifts.
  if (hits + shifts < 5)
    quality -= (5 - (hits + shifts)) * 10;

  return quality <= 0 ? null : new Match(det, this, quality);
};
/**
 * Detector for ISO-2022-JP. Each instance carries its own charset name
 * accessor and the list of escape sequences it recognises; matching is
 * inherited from ISO_2022.
 */
module.exports.ISO_2022_JP = function() {
  // Every sequence below opens with the escape byte.
  var ESC = 0x1b;

  this.name = function() {
    return 'ISO-2022-JP';
  };

  this.escapeSequences = [
    [ ESC, 0x24, 0x28, 0x43 ], // KS X 1001:1992
    [ ESC, 0x24, 0x28, 0x44 ], // JIS X 212-1990
    [ ESC, 0x24, 0x40 ],       // JIS C 6226-1978
    [ ESC, 0x24, 0x41 ],       // GB 2312-80
    [ ESC, 0x24, 0x42 ],       // JIS X 208-1983
    [ ESC, 0x26, 0x40 ],       // JIS X 208 1990, 1997
    [ ESC, 0x28, 0x42 ],       // ASCII
    [ ESC, 0x28, 0x48 ],       // JIS-Roman
    [ ESC, 0x28, 0x49 ],       // Half-width katakana
    [ ESC, 0x28, 0x4a ],       // JIS-Roman
    [ ESC, 0x2e, 0x41 ],       // ISO 8859-1
    [ ESC, 0x2e, 0x46 ]        // ISO 8859-7
  ];
};
util.inherits(module.exports.ISO_2022_JP, ISO_2022);
/**
 * Detector for ISO-2022-KR. Recognises a single escape sequence;
 * matching is inherited from ISO_2022.
 */
module.exports.ISO_2022_KR = function() {
  // The one sequence below opens with the escape byte.
  var ESC = 0x1b;

  this.name = function() {
    return 'ISO-2022-KR';
  };

  this.escapeSequences = [
    [ ESC, 0x24, 0x29, 0x43 ]
  ];
};
util.inherits(module.exports.ISO_2022_KR, ISO_2022);
/**
 * Detector for ISO-2022-CN. Each instance carries its own charset name
 * accessor and the list of escape sequences it recognises; matching is
 * inherited from ISO_2022.
 */
module.exports.ISO_2022_CN = function() {
  // Every sequence below opens with the escape byte.
  var ESC = 0x1b;

  this.name = function() {
    return 'ISO-2022-CN';
  };

  this.escapeSequences = [
    [ ESC, 0x24, 0x29, 0x41 ], // GB 2312-80
    [ ESC, 0x24, 0x29, 0x47 ], // CNS 11643-1992 Plane 1
    [ ESC, 0x24, 0x2a, 0x48 ], // CNS 11643-1992 Plane 2
    [ ESC, 0x24, 0x29, 0x45 ], // ISO-IR-165
    [ ESC, 0x24, 0x2b, 0x49 ], // CNS 11643-1992 Plane 3
    [ ESC, 0x24, 0x2b, 0x4a ], // CNS 11643-1992 Plane 4
    [ ESC, 0x24, 0x2b, 0x4b ], // CNS 11643-1992 Plane 5
    [ ESC, 0x24, 0x2b, 0x4c ], // CNS 11643-1992 Plane 6
    [ ESC, 0x24, 0x2b, 0x4d ], // CNS 11643-1992 Plane 7
    [ ESC, 0x4e ],             // SS2
    [ ESC, 0x4f ]              // SS3
  ];
};
util.inherits(module.exports.ISO_2022_CN, ISO_2022);