1 /*
2  * This code is derivative of guess.c of Gauche-0.8.3.
3  * The following is the original copyright notice.
4  */
5 
6 /*
7  * guess.c - guessing character encoding
8  *
9  *   Copyright (c) 2000-2003 Shiro Kawai, All rights reserved.
10  *
11  *   Redistribution and use in source and binary forms, with or without
12  *   modification, are permitted provided that the following conditions
13  *   are met:
14  *
15  *   1. Redistributions of source code must retain the above copyright
16  *      notice, this list of conditions and the following disclaimer.
17  *
18  *   2. Redistributions in binary form must reproduce the above copyright
19  *      notice, this list of conditions and the following disclaimer in the
20  *      documentation and/or other materials provided with the distribution.
21  *
22  *   3. Neither the name of the authors nor the names of its contributors
23  *      may be used to endorse or promote products derived from this
24  *      software without specific prior written permission.
25  *
26  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
32  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37  *
38  */
39 /**
40  * 
41  *
42  * License: BSD 3-clause
43  */
44 module libguess_d.guess.cjk;
45 
46 
47 private static import libguess_d.dfa;
48 private static import libguess_d.encoding;
49 
/**
 * Pre-scan step shared by the CJK guessers in this module.
 *
 * Unless the input starts with an ESC byte (0x1B), the value returned by
 * libguess_d.dfa.check_UTF16_BOM for the input is stored in rv.
 * When the input is empty or starts with ESC, rv is left untouched.
 *
 * Params:
 *      input = input string
 *      rv = receives the result of the BOM check (only written when the check runs)
 */
pure nothrow @safe @nogc
private void cjk_first_check(const char[] input, ref libguess_d.encoding.libguess_encoding rv)

	do
	{
		static import libguess_d.dfa;

		// Nothing to inspect; indexing input[0] below would be out of range.
		if (input.length == 0) {
			return;
		}

		if (input[0] != 0x1B) {
			rv = libguess_d.dfa.check_UTF16_BOM(input);
		}
	}
61 
/**
 * Guesses the character encoding of a string presumed to be Japanese.
 *
 * The candidates tried are UTF-8, Shift_JIS and EUC-JP; an ESC byte followed
 * by '$' or '(' is reported as ISO-2022-JP.
 *
 * Params:
 *      input = input string
 *
 * Returns: the guessed encoding, or libguess_encoding.undefined when the
 *          input is empty or no candidate remains
 */
pure nothrow @trusted @nogc
public libguess_d.encoding.libguess_encoding guess_ja(const char[] input)

	in
	{
	}

	do
	{
		static import libguess_d.dfa;
		static import libguess_d.encoding;

		alias Encoding = libguess_d.encoding.libguess_encoding;

		if (input.length == 0) {
			return Encoding.undefined;
		}

		libguess_d.dfa.guess_dfa[3] candidates = [libguess_d.encoding.utf8, libguess_d.encoding.sjis, libguess_d.encoding.eucjp];
		Encoding guessed = Encoding.undefined;

		// The pre-scan may already settle the answer.
		.cjk_first_check(input, guessed);

		if (guessed != Encoding.undefined) {
			return guessed;
		}

		size_t pos = 0;

		while (pos < input.length) {
			ubyte current = cast(ubyte)(input[pos]);

			// ISO-2022 escape sequence: ESC followed by '$' or '('.
			// The byte after ESC is consumed here, so when it does not
			// match, that byte (not ESC) is what gets fed to the DFAs.
			if ((current == 0x1B) && ((pos + 1) < input.length)) {
				pos++;
				current = cast(ubyte)(input[pos]);

				if ((current == '$') || (current == '(')) {
					return Encoding.ISO_2022_JP;
				}
			}

			guessed = libguess_d.dfa.dfa_process(candidates, current);

			if (guessed != Encoding.undefined) {
				return guessed;
			}

			// Every candidate has been ruled out.
			if (libguess_d.dfa.dfa_none(candidates)) {
				return Encoding.undefined;
			}

			pos++;
		}

		// Input exhausted without a decision: take whatever dfa_top reports.
		return libguess_d.dfa.dfa_top(candidates);
	}
125 
/**
 * Guesses the character encoding of a string presumed to be Taiwanese
 * (Traditional Chinese).
 *
 * The candidates tried are UTF-8 and Big5; an ESC byte followed by '$' or
 * '(' is reported as ISO-2022-TW.
 *
 * Params:
 *      input = input string
 *
 * Returns: the guessed encoding, or libguess_encoding.undefined when the
 *          input is empty or no candidate remains
 */
pure nothrow @trusted @nogc
public libguess_d.encoding.libguess_encoding guess_tw(const char[] input)

	in
	{
	}

	do
	{
		static import libguess_d.dfa;
		static import libguess_d.encoding;

		alias Encoding = libguess_d.encoding.libguess_encoding;

		if (input.length == 0) {
			return Encoding.undefined;
		}

		libguess_d.dfa.guess_dfa[2] candidates = [libguess_d.encoding.utf8, libguess_d.encoding.big5];
		Encoding guessed = Encoding.undefined;

		// The pre-scan may already settle the answer.
		.cjk_first_check(input, guessed);

		if (guessed != Encoding.undefined) {
			return guessed;
		}

		size_t pos = 0;

		while (pos < input.length) {
			ubyte current = cast(ubyte)(input[pos]);

			// ISO-2022 escape sequence: ESC followed by '$' or '('.
			// The byte after ESC is consumed here, so when it does not
			// match, that byte (not ESC) is what gets fed to the DFAs.
			if ((current == 0x1B) && ((pos + 1) < input.length)) {
				pos++;
				current = cast(ubyte)(input[pos]);

				if ((current == '$') || (current == '(')) {
					return Encoding.ISO_2022_TW;
				}
			}

			guessed = libguess_d.dfa.dfa_process(candidates, current);

			if (guessed != Encoding.undefined) {
				return guessed;
			}

			// Every candidate has been ruled out.
			if (libguess_d.dfa.dfa_none(candidates)) {
				return Encoding.undefined;
			}

			pos++;
		}

		// Input exhausted without a decision: take whatever dfa_top reports.
		return libguess_d.dfa.dfa_top(candidates);
	}
189 
/**
 * Guesses the character encoding of a string presumed to be Chinese.
 *
 * The candidates tried are UTF-8, GB2312 and GB18030; the byte sequence
 * ESC '$' followed by ')' or '+' is reported as ISO-2022-CN.
 *
 * Params:
 *      input = input string
 *
 * Returns: the guessed encoding, or libguess_encoding.undefined when the
 *          input is empty or no candidate remains
 */
pure nothrow @trusted @nogc
public libguess_d.encoding.libguess_encoding guess_cn(const char[] input)

	in
	{
	}

	do
	{
		static import libguess_d.dfa;
		static import libguess_d.encoding;

		if (input.length == 0) {
			return libguess_d.encoding.libguess_encoding.undefined;
		}

		libguess_d.dfa.guess_dfa[3] cn_order = [libguess_d.encoding.utf8, libguess_d.encoding.gb2312, libguess_d.encoding.gb18030];
		libguess_d.encoding.libguess_encoding rv = libguess_d.encoding.libguess_encoding.undefined;

		.cjk_first_check(input, rv);

		if (rv != libguess_d.encoding.libguess_encoding.undefined) {
			return rv;
		}

		for (size_t i = 0; i < input.length; i++) {
			ubyte c = cast(ubyte)(input[i]);

			/* special treatment of iso-2022 escape sequence */
			if ((c == 0x1B) && ((i + 1) < input.length)) {
				// As before, the byte following ESC replaces ESC as the
				// value fed to the DFAs below; i itself is not advanced.
				// NOTE(review): this means that byte is processed again on
				// the next iteration - confirm whether that is intended.
				c = cast(ubyte)(input[i + 1]);

				// The third byte of the sequence exists only when ESC is
				// not the second-to-last byte. Reading input[i + 2]
				// unconditionally ran past the end of the slice.
				ubyte c2 = 0;

				if ((i + 2) < input.length) {
					c2 = cast(ubyte)(input[i + 2]);
				}

				if ((c == '$') && ((c2 == ')') || (c2 == '+'))) {
					return libguess_d.encoding.libguess_encoding.ISO_2022_CN;
				}
			}

			rv = libguess_d.dfa.dfa_process(cn_order, c);

			if (rv != libguess_d.encoding.libguess_encoding.undefined) {
				return rv;
			}

			if (libguess_d.dfa.dfa_none(cn_order)) {
				/* we ran out the possibilities */
				return libguess_d.encoding.libguess_encoding.undefined;
			}
		}

		// Input exhausted without a decision: take whatever dfa_top reports.
		return libguess_d.dfa.dfa_top(cn_order);
	}
255 
/**
 * Guesses the character encoding of a string presumed to be Korean.
 *
 * The candidates tried are UTF-8, EUC-KR and Johab; the byte sequence
 * ESC '$' ')' is reported as ISO-2022-KR.
 *
 * Params:
 *      input = input string
 *
 * Returns: the guessed encoding, or libguess_encoding.undefined when the
 *          input is empty or no candidate remains
 */
pure nothrow @trusted @nogc
public libguess_d.encoding.libguess_encoding guess_kr(const char[] input)

	in
	{
	}

	do
	{
		static import libguess_d.dfa;
		static import libguess_d.encoding;

		if (input.length == 0) {
			return libguess_d.encoding.libguess_encoding.undefined;
		}

		libguess_d.dfa.guess_dfa[3] kr_order = [libguess_d.encoding.utf8, libguess_d.encoding.euckr, libguess_d.encoding.johab];
		libguess_d.encoding.libguess_encoding rv = libguess_d.encoding.libguess_encoding.undefined;

		.cjk_first_check(input, rv);

		if (rv != libguess_d.encoding.libguess_encoding.undefined) {
			return rv;
		}

		for (size_t i = 0; i < input.length; i++) {
			ubyte c = cast(ubyte)(input[i]);

			/* special treatment of iso-2022 escape sequence */
			if ((c == 0x1B) && ((i + 1) < input.length)) {
				// As before, the byte following ESC replaces ESC as the
				// value fed to the DFAs below; i itself is not advanced.
				// NOTE(review): this means that byte is processed again on
				// the next iteration - confirm whether that is intended.
				c = cast(ubyte)(input[i + 1]);

				// The third byte of the sequence exists only when ESC is
				// not the second-to-last byte. Reading input[i + 2]
				// unconditionally ran past the end of the slice.
				ubyte c2 = 0;

				if ((i + 2) < input.length) {
					c2 = cast(ubyte)(input[i + 2]);
				}

				if ((c == '$') && (c2 == ')')) {
					return libguess_d.encoding.libguess_encoding.ISO_2022_KR;
				}
			}

			rv = libguess_d.dfa.dfa_process(kr_order, c);

			if (rv != libguess_d.encoding.libguess_encoding.undefined) {
				return rv;
			}

			if (libguess_d.dfa.dfa_none(kr_order)) {
				/* we ran out the possibilities */
				return libguess_d.encoding.libguess_encoding.undefined;
			}
		}

		// Input exhausted without a decision: take whatever dfa_top reports.
		return libguess_d.dfa.dfa_top(kr_order);
	}