// dbcs-data.js (7.9 KB)
  1. // Description of supported double byte encodings and aliases.
  2. // Tables are not require()-d until they are needed to speed up library load.
  3. // require()-s are direct to support Browserify.
  4. module.exports = {
  5. // == Japanese/ShiftJIS ====================================================
  6. // All japanese encodings are based on JIS X set of standards:
  7. // JIS X 0201 - Single-byte encoding of ASCII + ¥ + Kana chars at 0xA1-0xDF.
  8. // JIS X 0208 - Main set of 6879 characters, placed in 94x94 plane, to be encoded by 2 bytes.
  9. // Has several variations in 1978, 1983, 1990 and 1997.
  10. // JIS X 0212 - Supplementary plane of 6067 chars in 94x94 plane. 1990. Effectively dead.
  11. // JIS X 0213 - Extension and modern replacement of 0208 and 0212. Total chars: 11233.
  12. // 2 planes, first is superset of 0208, second - revised 0212.
  13. // Introduced in 2000, revised 2004. Some characters are in Unicode Plane 2 (0x2xxxx)
  14. // Byte encodings are:
  15. // * Shift_JIS: Compatible with 0201, uses not defined chars in top half as lead bytes for double-byte
  16. // encoding of 0208. Lead byte ranges: 0x81-0x9F, 0xE0-0xEF; Trail byte ranges: 0x40-0x7E, 0x80-0x9E, 0x9F-0xFC.
  17. // Windows CP932 is a superset of Shift_JIS. Some companies added more chars, notably KDDI.
  18. // * EUC-JP: Up to 3 bytes per character. Used mostly on *nixes.
  19. // 0x00-0x7F - lower part of 0201
  20. // 0x8E, 0xA1-0xDF - upper part of 0201
  21. // (0xA1-0xFE)x2 - 0208 plane (94x94).
  22. // 0x8F, (0xA1-0xFE)x2 - 0212 plane (94x94).
  23. // * JIS X 208: 7-bit, direct encoding of 0208. Byte ranges: 0x21-0x7E (94 values). Uncommon.
  24. // Used as-is in ISO2022 family.
  25. // * ISO2022-JP: Stateful encoding, with escape sequences to switch between ASCII,
  26. // 0201-1976 Roman, 0208-1978, 0208-1983.
  27. // * ISO2022-JP-1: Adds esc seq for 0212-1990.
  28. // * ISO2022-JP-2: Adds esc seq for GB2313-1980, KSX1001-1992, ISO8859-1, ISO8859-7.
  29. // * ISO2022-JP-3: Adds esc seq for 0201-1976 Kana set, 0213-2000 Planes 1, 2.
  30. // * ISO2022-JP-2004: Adds 0213-2004 Plane 1.
  31. //
  32. // After JIS X 0213 appeared, Shift_JIS-2004, EUC-JISX0213 and ISO2022-JP-2004 followed, with just changing the planes.
  33. //
  34. // Overall, it seems that it's a mess :( http://www8.plala.or.jp/tkubota1/unicode-symbols-map2.html
  35. 'shiftjis': {
  36. type: '_dbcs',
  37. table: function() { return require('./tables/shiftjis.json') },
  38. encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E},
  39. encodeSkipVals: [{from: 0xED40, to: 0xF940}],
  40. },
  41. 'csshiftjis': 'shiftjis',
  42. 'mskanji': 'shiftjis',
  43. 'sjis': 'shiftjis',
  44. 'windows-31j': 'shiftjis',
  45. 'x-sjis': 'shiftjis',
  46. 'windows932': 'shiftjis',
  47. '932': 'shiftjis',
  48. 'cp932': 'shiftjis',
  49. 'eucjp': {
  50. type: '_dbcs',
  51. table: function() { return require('./tables/eucjp.json') },
  52. encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E},
  53. },
  54. // TODO: KDDI extension to Shift_JIS
  55. // TODO: IBM CCSID 942 = CP932, but F0-F9 custom chars and other char changes.
  56. // TODO: IBM CCSID 943 = Shift_JIS = CP932 with original Shift_JIS lower 128 chars.
  57. // == Chinese/GBK ==========================================================
  58. // http://en.wikipedia.org/wiki/GBK
  59. // Oldest GB2312 (1981, ~7600 chars) is a subset of CP936
  60. 'gb2312': 'cp936',
  61. 'gb231280': 'cp936',
  62. 'gb23121980': 'cp936',
  63. 'csgb2312': 'cp936',
  64. 'csiso58gb231280': 'cp936',
  65. 'euccn': 'cp936',
  66. 'isoir58': 'gbk',
  67. // Microsoft's CP936 is a subset and approximation of GBK.
  68. // TODO: Euro = 0x80 in cp936, but not in GBK (where it's valid but undefined)
  69. 'windows936': 'cp936',
  70. '936': 'cp936',
  71. 'cp936': {
  72. type: '_dbcs',
  73. table: function() { return require('./tables/cp936.json') },
  74. },
  75. // GBK (~22000 chars) is an extension of CP936 that added user-mapped chars and some other.
  76. 'gbk': {
  77. type: '_dbcs',
  78. table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) },
  79. },
  80. 'xgbk': 'gbk',
  81. // GB18030 is an algorithmic extension of GBK.
  82. 'gb18030': {
  83. type: '_dbcs',
  84. table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) },
  85. gb18030: function() { return require('./tables/gb18030-ranges.json') },
  86. },
  87. 'chinese': 'gb18030',
  88. // TODO: Support GB18030 (~27000 chars + whole unicode mapping, cp54936)
  89. // http://icu-project.org/docs/papers/gb18030.html
  90. // http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml
  91. // http://www.khngai.com/chinese/charmap/tblgbk.php?page=0
  92. // == Korean ===============================================================
  93. // EUC-KR, KS_C_5601 and KS X 1001 are exactly the same.
  94. 'windows949': 'cp949',
  95. '949': 'cp949',
  96. 'cp949': {
  97. type: '_dbcs',
  98. table: function() { return require('./tables/cp949.json') },
  99. },
  100. 'cseuckr': 'cp949',
  101. 'csksc56011987': 'cp949',
  102. 'euckr': 'cp949',
  103. 'isoir149': 'cp949',
  104. 'korean': 'cp949',
  105. 'ksc56011987': 'cp949',
  106. 'ksc56011989': 'cp949',
  107. 'ksc5601': 'cp949',
  108. // == Big5/Taiwan/Hong Kong ================================================
  109. // There are lots of tables for Big5 and cp950. Please see the following links for history:
  110. // http://moztw.org/docs/big5/ http://www.haible.de/bruno/charsets/conversion-tables/Big5.html
  111. // Variations, in roughly number of defined chars:
  112. // * Windows CP 950: Microsoft variant of Big5. Canonical: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT
  113. // * Windows CP 951: Microsoft variant of Big5-HKSCS-2001. Seems to be never public. http://me.abelcheung.org/articles/research/what-is-cp951/
  114. // * Big5-2003 (Taiwan standard) almost superset of cp950.
  115. // * Unicode-at-on (UAO) / Mozilla 1.8. Falling out of use on the Web. Not supported by other browsers.
  116. // * Big5-HKSCS (-2001, -2004, -2008). Hong Kong standard.
  117. // many unicode code points moved from PUA to Supplementary plane (U+2XXXX) over the years.
  118. // Plus, it has 4 combining sequences.
  119. // Seems that Mozilla refused to support it for 10 yrs. https://bugzilla.mozilla.org/show_bug.cgi?id=162431 https://bugzilla.mozilla.org/show_bug.cgi?id=310299
  120. // because big5-hkscs is the only encoding to include astral characters in non-algorithmic way.
  121. // Implementations are not consistent within browsers; sometimes labeled as just big5.
  122. // MS Internet Explorer switches from big5 to big5-hkscs when a patch applied.
  123. // Great discussion & recap of what's going on https://bugzilla.mozilla.org/show_bug.cgi?id=912470#c31
  124. // In the encoder, it might make sense to support encoding old PUA mappings to Big5 bytes seq-s.
  125. // Official spec: http://www.ogcio.gov.hk/en/business/tech_promotion/ccli/terms/doc/2003cmp_2008.txt
  126. // http://www.ogcio.gov.hk/tc/business/tech_promotion/ccli/terms/doc/hkscs-2008-big5-iso.txt
  127. //
  128. // Current understanding of how to deal with Big5(-HKSCS) is in the Encoding Standard, http://encoding.spec.whatwg.org/#big5-encoder
  129. // Unicode mapping (http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT) is said to be wrong.
  130. 'windows950': 'cp950',
  131. '950': 'cp950',
  132. 'cp950': {
  133. type: '_dbcs',
  134. table: function() { return require('./tables/cp950.json') },
  135. },
  136. // Big5 has many variations and is an extension of cp950. We use Encoding Standard's as a consensus.
  137. 'big5': 'big5hkscs',
  138. 'big5hkscs': {
  139. type: '_dbcs',
  140. table: function() { return require('./tables/cp950.json').concat(require('./tables/big5-added.json')) },
  141. },
  142. 'cnbig5': 'big5hkscs',
  143. 'csbig5': 'big5hkscs',
  144. 'xxbig5': 'big5hkscs',
  145. };