Ruby 1.9.3p551 (2014-11-13 revision 48407)
euc_jp.c
Go to the documentation of this file.
1 /**********************************************************************
2  euc_jp.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in the
15  * documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regint.h"
31 
32 
/*
 * Nonzero when byte c lies outside the range 0xa1..0xfe.
 *
 * The difference (c - 0xa1) is truncated to UChar, so bytes below 0xa1
 * wrap around to large values and are caught by the same single
 * comparison that catches 0xff.
 *
 * NOTE(review): despite the name, this is also true for every byte below
 * 0xa1 (including ASCII); it effectively answers "can c NOT be a
 * 0xa1..0xfe byte?", which is how left_adjust_char_head uses it.
 */
#define eucjp_islead(c)    ((UChar )((c) - 0xa1) > 0xfe - 0xa1)
34 
/*
 * Expected total byte length of a character, indexed by its first byte.
 *
 *   0x8e        -> 2
 *   0x8f        -> 3
 *   0xa1..0xfe  -> 2
 *   all others  -> 1
 *
 * mbc_enc_len uses this table only to report how many more bytes are
 * needed when the input ends in the middle of a character.
 */
static const int EncLen_EUCJP[] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
53 
/*
 * States of the byte-validation automaton used by mbc_enc_len.
 *
 * Negative values are terminal: ACCEPT means the bytes consumed so far
 * form a complete character, FAILURE means the last byte is not allowed
 * in the current state.  Non-negative values are the index of the next
 * row of trans[] to consult.
 */
typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2 } state_t;

/* Short aliases so the tables below stay readable; undefined afterwards. */
#define A ACCEPT
#define F FAILURE

/*
 * trans[state][byte] -> next state.
 *
 *   S0: first byte.   0x00..0x7f accept at once; 0x8e and 0xa1..0xfe
 *                     go to S1; 0x8f goes to S2; everything else fails.
 *   S1: last byte.    0xa1..0xfe accept; everything else fails.
 *   S2: middle byte   0xa1..0xfe go to S1; everything else fails.
 *       (after 0x8f).
 */
static const signed char trans[][0x100] = {
  { /* S0   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, 1, 2,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
  },
  { /* S1   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F
  },
  { /* S2   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
  },

};
#undef A
#undef F
116 
117 static int
119 {
120  int firstbyte = *p++;
121  state_t s;
122  s = trans[0][firstbyte];
123  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
125  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-1);
126  s = trans[s][*p++];
127  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
129  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-2);
130  s = trans[s][*p++];
131  return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
133 }
134 
135 static OnigCodePoint
137 {
138  int c, i, len;
140 
141  len = enclen(enc, p, end);
142  n = (OnigCodePoint )*p++;
143  if (len == 1) return n;
144 
145  for (i = 1; i < len; i++) {
146  if (p >= end) break;
147  c = *p++;
148  n <<= 8; n += c;
149  }
150  return n;
151 }
152 
153 static int
155 {
156  if (ONIGENC_IS_CODE_ASCII(code)) return 1;
157  else if (code > 0xffffff)
159  else if (code & 0x800000) return 3;
160  else if (code & 0x8000) return 2;
161  else
163 }
164 
#if 0
/*
 * Compiled out by the surrounding "#if 0"; kept for reference only.
 *
 * Returns the first byte code would occupy when encoded: bits 16..23 if
 * any of them is set, otherwise bits 8..15 if any of them is set,
 * otherwise code itself.
 */
static int
code_to_mbc_first(OnigCodePoint code)
{
  int first;

  if ((code & 0xff0000) != 0) {
    first = (code >> 16) & 0xff;
  }
  else if ((code & 0xff00) != 0) {
    first = (code >> 8) & 0xff;
  }
  else {
    return (int )code;
  }
  return first;
}
#endif
183 
184 static int
186 {
187  UChar *p = buf;
188 
189  if ((code & 0xff0000) != 0) *p++ = (UChar )(((code >> 16) & 0xff));
190  if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff));
191  *p++ = (UChar )(code & 0xff);
192 
193 #if 1
194  if (enclen(enc, buf, p) != (p - buf))
196 #endif
197  return (int)(p - buf);
198 }
199 
200 static int
202  const UChar** pp, const UChar* end, UChar* lower,
204 {
205  int len;
206  const UChar* p = *pp;
207 
208  if (ONIGENC_IS_MBC_ASCII(p)) {
210  (*pp)++;
211  return 1;
212  }
213  else {
214  int i;
215 
216  len = enclen(enc, p, end);
217  for (i = 0; i < len; i++) {
218  *lower++ = *p++;
219  }
220  (*pp) += len;
221  return len; /* return byte length of converted char to lower */
222  }
223 }
224 
225 static UChar*
227 {
228  /* In this encoding
229  mb-trail bytes doesn't mix with single bytes.
230  */
231  const UChar *p;
232  int len;
233 
234  if (s <= start) return (UChar* )s;
235  p = s;
236 
237  while (!eucjp_islead(*p) && p > start) p--;
238  len = enclen(enc, p, end);
239  if (p + len > s) return (UChar* )p;
240  p += len;
241  return (UChar* )(p + ((s - p) & ~1));
242 }
243 
244 static int
246 {
247  const UChar c = *s;
248  if (c <= 0x7e || c == 0x8e || c == 0x8f)
249  return TRUE;
250  else
251  return FALSE;
252 }
253 
254 
255 static int PropertyInited = 0;
257 static int PropertyListNum;
258 static int PropertyListSize;
260 
261 static const OnigCodePoint CR_Hiragana[] = {
262  1,
263  0xa4a1, 0xa4f3
264 }; /* CR_Hiragana */
265 
266 static const OnigCodePoint CR_Katakana[] = {
267  3,
268  0xa5a1, 0xa5f6,
269  0xaaa6, 0xaaaf,
270  0xaab1, 0xaadd
271 }; /* CR_Katakana */
272 
273 static int
275 {
276  int r;
277 
278  PROPERTY_LIST_ADD_PROP("hiragana", CR_Hiragana);
279  PROPERTY_LIST_ADD_PROP("katakana", CR_Katakana);
280  PropertyInited = 1;
281 
282  end:
283  return r;
284 }
285 
286 static int
288 {
289  st_data_t ctype;
290  UChar *s, *e;
291 
293 
294  s = e = ALLOCA_N(UChar, end-p+1);
295  for (; p < end; p++) {
297  }
298 
299  if (onig_st_lookup_strend(PropertyNameTable, s, e, &ctype) == 0) {
300  return onigenc_minimum_property_name_to_ctype(enc, s, e);
301  }
302 
303  return (int)ctype;
304 }
305 
306 static int
308 {
309  if (ctype <= ONIGENC_MAX_STD_CTYPE) {
310  if (code < 128)
311  return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
312  else {
313  if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
314  return (code_to_mbclen(code, enc) > 1 ? TRUE : FALSE);
315  }
316  }
317  }
318  else {
320 
321  ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
322  if (ctype >= (unsigned int )PropertyListNum)
323  return ONIGERR_TYPE_BUG;
324 
325  return onig_is_in_code_range((UChar* )PropertyList[ctype], code);
326  }
327 
328  return FALSE;
329 }
330 
331 static int
333  const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
334 {
335  if (ctype <= ONIGENC_MAX_STD_CTYPE) {
336  return ONIG_NO_SUPPORT_CONFIG;
337  }
338  else {
339  *sb_out = 0x80;
340 
342 
343  ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
344  if (ctype >= (OnigCtype )PropertyListNum)
345  return ONIGERR_TYPE_BUG;
346 
347  *ranges = PropertyList[ctype];
348  return 0;
349  }
350 }
351 
352 
353 OnigEncodingDefine(euc_jp, EUC_JP) = {
354  mbc_enc_len,
355  "EUC-JP", /* name */
356  3, /* max enc length */
357  1, /* min enc length */
359  mbc_to_code,
361  code_to_mbc,
370  0
371 };
372 /*
373  * Name: EUC-JP
374  * MIBenum: 18
375  * Link: http://www.iana.org/assignments/character-sets
376  * Link: http://home.m05.itscom.net/numa/cde/sjis-euc/sjis-euc.html
377  * Link: http://home.m05.itscom.net/numa/uocjleE.pdf
378  */
379 ENC_ALIAS("eucJP", "EUC-JP") /* UI-OSF Application Platform Profile for Japanese Environment Version 1.1 */
380 
381 /*
382  * Name: eucJP-ms
383  * Link: http://home.m05.itscom.net/numa/cde/ucs-conv/ucs-conv.html
384  * Link: http://www2d.biglobe.ne.jp/~msyk/charcode/cp932/eucJP-ms.html
385  * Link: http://ja.wikipedia.org/wiki/EUC-JP
386  */
387 ENC_REPLICATE("eucJP-ms", "EUC-JP") /* TOG/JVC CDE/Motif Technical WG */
388 ENC_ALIAS("euc-jp-ms", "eucJP-ms")
389 
390 /*
391  * Name: CP51932
392  * MIBenum: 2108
393  * Link: http://www.iana.org/assignments/charset-reg/CP51932
394  * Link: http://search.cpan.org/src/NARUSE/Encode-EUCJPMS-0.07/ucm/cp51932.ucm
395  * Link: http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932
396  * Link: http://msyk.at.webry.info/200511/article_2.html
397  */
398 ENC_REPLICATE("CP51932", "EUC-JP")
static int mbc_enc_len(const UChar *p, const UChar *e, OnigEncoding enc ARG_UNUSED)
Definition: euc_jp.c:118
unsigned int OnigCodePoint
Definition: oniguruma.h:111
ssize_t n
Definition: bigdecimal.c:5519
#define OnigEncodingDefine(f, n)
static int property_name_to_ctype(OnigEncoding enc, UChar *p, UChar *end)
Definition: euc_jp.c:287
Definition: big5.c:90
static int PropertyListNum
Definition: euc_jp.c:257
#define FALSE
Definition: nkf.h:185
static const OnigCodePoint CR_Katakana[]
Definition: euc_jp.c:266
#define ONIGERR_TOO_BIG_WIDE_CHAR_VALUE
Definition: oniguruma.h:559
int onig_st_lookup_strend(hash_table_type *table, const UChar *str_key, const UChar *end_key, hash_data_type *value)
Definition: regparse.c:369
code
Definition: tcltklib.c:3375
#define ONIGENC_IS_MBC_ASCII(p)
Definition: oniguruma.h:222
unsigned int OnigCaseFoldType
Definition: oniguruma.h:117
static int PropertyListSize
Definition: euc_jp.c:258
int onig_is_in_code_range(const UChar *p, OnigCodePoint code)
Definition: regcomp.c:5669
SYMID SyckParser * p
Definition: yaml2byte.c:119
VALUE enc
Definition: tcltklib.c:10402
state_t
Definition: big5.c:90
int onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED, OnigApplyAllCaseFoldFunc f, void *arg, OnigEncoding enc ARG_UNUSED)
Definition: regenc.c:409
#define ARG_UNUSED
r
Definition: bigdecimal.c:1154
#define ONIGERR_TYPE_BUG
Definition: oniguruma.h:502
#define ENC_ALIAS(name, orig)
Definition: encdb.c:18
static const OnigCodePoint ** PropertyList
Definition: euc_jp.c:256
#define ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype)
flag
Definition: tcltklib.c:2039
static const OnigCodePoint CR_Hiragana[]
Definition: euc_jp.c:261
static const int EncLen_EUCJP[]
Definition: euc_jp.c:35
Definition: big5.c:90
static int is_allowed_reverse_match(const UChar *s, const UChar *end, OnigEncoding enc ARG_UNUSED)
Definition: euc_jp.c:245
#define enclen(enc, p, e)
Definition: big5.c:90
#define A
Definition: euc_jp.c:55
Definition: nkf.c:99
static const signed char trans[][0x100]
Definition: euc_jp.c:57
#define PROPERTY_LIST_ADD_PROP(Name, CR)
Definition: regint.h:826
#define ALLOCA_N(type, n)
Definition: ruby.h:1038
static int code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
Definition: euc_jp.c:185
unsigned char buf[MIME_BUF_SIZE]
Definition: nkf.c:3913
Definition: big5.c:90
#define ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n)
Definition: oniguruma.h:250
int onigenc_is_mbc_newline_0x0a(const UChar *p, const UChar *end, OnigEncoding enc ARG_UNUSED)
Definition: regenc.c:580
static int init_property_list(void)
Definition: euc_jp.c:274
#define TRUE
Definition: nkf.h:186
Definition: nkf.c:101
register char * s
Definition: os2.c:56
unsigned int OnigCtype
Definition: oniguruma.h:112
#define ONIG_NO_SUPPORT_CONFIG
Definition: oniguruma.h:498
#define ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n)
Definition: oniguruma.h:243
#define eucjp_islead(c)
Definition: euc_jp.c:33
register unsigned int len
Definition: name2ctype.h:22210
#define F
Definition: euc_jp.c:56
int onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED, const OnigUChar *p, const OnigUChar *end ARG_UNUSED, OnigCaseFoldCodeItem items[], OnigEncoding enc ARG_UNUSED)
Definition: regenc.c:432
VpDivd * c
Definition: bigdecimal.c:1163
static OnigCodePoint mbc_to_code(const UChar *p, const UChar *end, OnigEncoding enc)
Definition: euc_jp.c:136
long st_data_t
Definition: syck.h:69
static int get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out, const OnigCodePoint *ranges[], OnigEncoding enc ARG_UNUSED)
Definition: euc_jp.c:332
#define UChar
Definition: oniguruma.h:107
gz end
Definition: zlib.c:2033
#define ONIGENC_MAX_STD_CTYPE
Definition: oniguruma.h:206
static int code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
Definition: euc_jp.c:154
#define ONIGENC_CONSTRUCT_MBCLEN_INVALID()
Definition: oniguruma.h:247
#define ENC_REPLICATE(name, orig)
Definition: encdb.c:17
static UChar * left_adjust_char_head(const UChar *start, const UChar *s, const UChar *end, OnigEncoding enc)
Definition: euc_jp.c:226
char * start
Definition: yaml2byte.c:126
static hash_table_type * PropertyNameTable
Definition: euc_jp.c:259
void hash_table_type
Definition: regint.h:813
Definition: emacs_mule.c:68
#define ONIGENC_ASCII_CODE_TO_LOWER_CASE(c)
static int PropertyInited
Definition: euc_jp.c:255
#define ONIGENC_IS_CODE_ASCII(code)
Definition: oniguruma.h:223
static int is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
Definition: euc_jp.c:307
#define PROPERTY_LIST_INIT_CHECK
Definition: regint.h:832
BDIGIT e
Definition: bigdecimal.c:4946
ssize_t i
Definition: bigdecimal.c:5519
#define ONIGERR_INVALID_CODE_POINT_VALUE
Definition: oniguruma.h:557
#define CTYPE_IS_WORD_GRAPH_PRINT(ctype)
int onigenc_minimum_property_name_to_ctype(OnigEncoding enc, UChar *p, UChar *end)
Definition: regenc.c:790
static int mbc_case_fold(OnigCaseFoldType flag, const UChar **pp, const UChar *end, UChar *lower, OnigEncoding enc)
Definition: euc_jp.c:201