/*
 * This code is derivative of guess.c of Gauche-0.8.3.
 * The following is the original copyright notice.
 */

/*
 * guess.c - guessing character encoding
 *
 * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the authors nor the names of its contributors
 * may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
/**
 * Character encoding guessers for Japanese, Taiwanese, Chinese and Korean
 * input.
 *
 * License: BSD 3-clause
 */
module libguess_d.guess.cjk;


private static import libguess_d.dfa;
private static import libguess_d.encoding;

/**
 * Runs the UTF-16 byte order mark check on the input, unless the input
 * starts with an escape character (0x1B).
 *
 * Params:
 *      input = input string
 *      rv = receives the result of libguess_d.dfa.check_UTF16_BOM; left untouched when the input is empty or starts with 0x1B
 */
pure nothrow @safe @nogc
private void cjk_first_check(const char[] input, ref libguess_d.encoding.libguess_encoding rv)

	do
	{
		/* An empty input has no first byte to look at. */
		if ((input.length != 0) && (input[0] != 0x1B)) {
			rv = libguess_d.dfa.check_UTF16_BOM(input);
		}
	}

/**
 * Inferring character encoding from a Japanese string
 *
 * Params:
 *      input = input string
 *
 * Returns: Character encoding, or libguess_encoding.undefined if the input is empty or no candidate remains
 */
pure nothrow @trusted @nogc
public libguess_d.encoding.libguess_encoding guess_ja(const char[] input)

	do
	{
		if (input.length == 0) {
			return libguess_d.encoding.libguess_encoding.undefined;
		}

		libguess_d.dfa.guess_dfa[3] ja_order = [libguess_d.encoding.utf8, libguess_d.encoding.sjis, libguess_d.encoding.eucjp];
		libguess_d.encoding.libguess_encoding rv = libguess_d.encoding.libguess_encoding.undefined;

		.cjk_first_check(input, rv);

		if (rv != libguess_d.encoding.libguess_encoding.undefined) {
			return rv;
		}

		for (size_t i = 0; i < input.length; i++) {
			ubyte c = cast(ubyte)(input[i]);

			/* special treatment of iso-2022 escape sequence */
			if ((c == 0x1B) && ((i + 1) < input.length)) {
				/* Consume the byte following ESC; it is the one fed to the DFAs below. */
				c = cast(ubyte)(input[++i]);

				if ((c == '$') || (c == '(')) {
					return libguess_d.encoding.libguess_encoding.ISO_2022_JP;
				}
			}

			rv = libguess_d.dfa.dfa_process(ja_order, c);

			if (rv != libguess_d.encoding.libguess_encoding.undefined) {
				return rv;
			}

			if (libguess_d.dfa.dfa_none(ja_order)) {
				/* we ran out the possibilities */
				return libguess_d.encoding.libguess_encoding.undefined;
			}
		}

		/* Whole input consumed without a decision: return whatever dfa_top reports. */
		rv = libguess_d.dfa.dfa_top(ja_order);

		return rv;
	}

/**
 * Inferring character encoding from a Taiwanese string
 *
 * Params:
 *      input = input string
 *
 * Returns: Character encoding, or libguess_encoding.undefined if the input is empty or no candidate remains
 */
pure nothrow @trusted @nogc
public libguess_d.encoding.libguess_encoding guess_tw(const char[] input)

	do
	{
		if (input.length == 0) {
			return libguess_d.encoding.libguess_encoding.undefined;
		}

		libguess_d.dfa.guess_dfa[2] tw_order = [libguess_d.encoding.utf8, libguess_d.encoding.big5];
		libguess_d.encoding.libguess_encoding rv = libguess_d.encoding.libguess_encoding.undefined;

		.cjk_first_check(input, rv);

		if (rv != libguess_d.encoding.libguess_encoding.undefined) {
			return rv;
		}

		for (size_t i = 0; i < input.length; i++) {
			ubyte c = cast(ubyte)(input[i]);

			/* special treatment of iso-2022 escape sequence */
			if ((c == 0x1B) && ((i + 1) < input.length)) {
				/* Consume the byte following ESC; it is the one fed to the DFAs below. */
				c = cast(ubyte)(input[++i]);

				if ((c == '$') || (c == '(')) {
					return libguess_d.encoding.libguess_encoding.ISO_2022_TW;
				}
			}

			rv = libguess_d.dfa.dfa_process(tw_order, c);

			if (rv != libguess_d.encoding.libguess_encoding.undefined) {
				return rv;
			}

			if (libguess_d.dfa.dfa_none(tw_order)) {
				/* we ran out the possibilities */
				return libguess_d.encoding.libguess_encoding.undefined;
			}
		}

		/* Whole input consumed without a decision: return whatever dfa_top reports. */
		rv = libguess_d.dfa.dfa_top(tw_order);

		return rv;
	}

/**
 * Inferring character encoding from a Chinese string
 *
 * Params:
 *      input = input string
 *
 * Returns: Character encoding, or libguess_encoding.undefined if the input is empty or no candidate remains
 */
pure nothrow @trusted @nogc
public libguess_d.encoding.libguess_encoding guess_cn(const char[] input)

	do
	{
		if (input.length == 0) {
			return libguess_d.encoding.libguess_encoding.undefined;
		}

		libguess_d.dfa.guess_dfa[3] cn_order = [libguess_d.encoding.utf8, libguess_d.encoding.gb2312, libguess_d.encoding.gb18030];
		libguess_d.encoding.libguess_encoding rv = libguess_d.encoding.libguess_encoding.undefined;

		.cjk_first_check(input, rv);

		if (rv != libguess_d.encoding.libguess_encoding.undefined) {
			return rv;
		}

		for (size_t i = 0; i < input.length; i++) {
			ubyte c = cast(ubyte)(input[i]);

			/* special treatment of iso-2022 escape sequence */
			if ((c == 0x1B) && ((i + 1) < input.length)) {
				/*
				 * NOTE(review): unlike guess_ja/guess_tw, i is not advanced
				 * here, so the byte after ESC is fed to the DFAs now and
				 * again on the next iteration. Kept as in the original;
				 * confirm whether this is intended.
				 */
				c = cast(ubyte)(input[i + 1]);

				/*
				 * The second byte after ESC may not exist when ESC is the
				 * second-to-last byte. Reading it unconditionally would
				 * index past the end of the input, so it stays 0 (which
				 * matches neither ')' nor '+') in that case.
				 */
				ubyte c2 = 0;

				if ((i + 2) < input.length) {
					c2 = cast(ubyte)(input[i + 2]);
				}

				if ((c == '$') && ((c2 == ')') || (c2 == '+'))) {
					return libguess_d.encoding.libguess_encoding.ISO_2022_CN;
				}
			}

			rv = libguess_d.dfa.dfa_process(cn_order, c);

			if (rv != libguess_d.encoding.libguess_encoding.undefined) {
				return rv;
			}

			if (libguess_d.dfa.dfa_none(cn_order)) {
				/* we ran out the possibilities */
				return libguess_d.encoding.libguess_encoding.undefined;
			}
		}

		/* Whole input consumed without a decision: return whatever dfa_top reports. */
		rv = libguess_d.dfa.dfa_top(cn_order);

		return rv;
	}

/**
 * Inferring character encoding from a Korean string
 *
 * Params:
 *      input = input string
 *
 * Returns: Character encoding, or libguess_encoding.undefined if the input is empty or no candidate remains
 */
pure nothrow @trusted @nogc
public libguess_d.encoding.libguess_encoding guess_kr(const char[] input)

	do
	{
		if (input.length == 0) {
			return libguess_d.encoding.libguess_encoding.undefined;
		}

		libguess_d.dfa.guess_dfa[3] kr_order = [libguess_d.encoding.utf8, libguess_d.encoding.euckr, libguess_d.encoding.johab];
		libguess_d.encoding.libguess_encoding rv = libguess_d.encoding.libguess_encoding.undefined;

		.cjk_first_check(input, rv);

		if (rv != libguess_d.encoding.libguess_encoding.undefined) {
			return rv;
		}

		for (size_t i = 0; i < input.length; i++) {
			ubyte c = cast(ubyte)(input[i]);

			/* special treatment of iso-2022 escape sequence */
			if ((c == 0x1B) && ((i + 1) < input.length)) {
				/*
				 * NOTE(review): unlike guess_ja/guess_tw, i is not advanced
				 * here, so the byte after ESC is fed to the DFAs now and
				 * again on the next iteration. Kept as in the original;
				 * confirm whether this is intended.
				 */
				c = cast(ubyte)(input[i + 1]);

				/*
				 * The second byte after ESC may not exist when ESC is the
				 * second-to-last byte. Reading it unconditionally would
				 * index past the end of the input, so it stays 0 (which
				 * does not match ')') in that case.
				 */
				ubyte c2 = 0;

				if ((i + 2) < input.length) {
					c2 = cast(ubyte)(input[i + 2]);
				}

				if ((c == '$') && (c2 == ')')) {
					return libguess_d.encoding.libguess_encoding.ISO_2022_KR;
				}
			}

			rv = libguess_d.dfa.dfa_process(kr_order, c);

			if (rv != libguess_d.encoding.libguess_encoding.undefined) {
				return rv;
			}

			if (libguess_d.dfa.dfa_none(kr_order)) {
				/* we ran out the possibilities */
				return libguess_d.encoding.libguess_encoding.undefined;
			}
		}

		/* Whole input consumed without a decision: return whatever dfa_top reports. */
		rv = libguess_d.dfa.dfa_top(kr_order);

		return rv;
	}