Ruby 1.9.3p551 (2014-11-13 revision 48407)
nkf.c
Go to the documentation of this file.
1 /*
2  * NKF - Ruby extension for Network Kanji Filter
3  *
4  * original nkf2.x is maintained at http://sourceforge.jp/projects/nkf/
5  *
6  * $Id: nkf.c 27947 2010-05-21 10:11:44Z nobu $
7  *
8  */
9 
10 #define RUBY_NKF_REVISION "$Revision: 27947 $"
11 #define RUBY_NKF_VERSION NKF_VERSION " (" NKF_RELEASE_DATE ")"
12 
13 #include "ruby/ruby.h"
14 #include "ruby/encoding.h"
15 
/*
 * I/O redirection for the nkf sources included further down in this file:
 * the stdio-style getc/ungetc/putchar names are redefined so that they read
 * from and write to the in-memory buffers declared below.
 */
16 /* Replace nkf's getchar/putchar for variable modification */
17 /* we never use getc, ungetc */
18 
19 #undef getc
20 #undef ungetc
/* getc(f): yields -1 once input_ctr has reached i_len, otherwise the next
 * byte of `input` (advancing input_ctr). The argument f is ignored. */
21 #define getc(f) (input_ctr>=i_len?-1:input[input_ctr++])
/* ungetc(c,f): steps input_ctr back by one; both arguments are ignored.
 * NOTE(review): the expansion is not parenthesized. */
22 #define ungetc(c,f) input_ctr--
23 
/* Initial value assigned to `incsize` by the conversion entry point. */
24 #define INCSIZE 32
25 #undef putchar
26 #undef TRUE
27 #undef FALSE
/* putchar(c) is routed to rb_nkf_putchar(), which stores into `output`. */
28 #define putchar(c) rb_nkf_putchar(c)
29 
30 /* Input/Output pointers */
31 
/* Destination byte buffer; set from RSTRING_PTR(result). */
32 static unsigned char *output;
/* Source byte buffer; set from RSTRING_PTR(src) by the entry points. */
33 static unsigned char *input;
/* Read position within `input`. */
34 static int input_ctr;
/* Length of `input` in bytes (from RSTRING_LENINT(src)). */
35 static int i_len;
/* Write position within `output`. */
36 static int output_ctr;
/* Capacity that rb_nkf_putchar() compares output_ctr against. */
37 static int o_len;
/* Amount added to o_len on the next growth; doubled after each growth. */
38 static int incsize;
39 
/* Ruby string whose storage `output` points into. */
40 static VALUE result;
41 
42 static int
43 rb_nkf_putchar(unsigned int c)
44 {
45  if (output_ctr >= o_len) {
46  o_len += incsize;
48  incsize *= 2;
49  output = (unsigned char *)RSTRING_PTR(result);
50  }
51  output[output_ctr++] = c;
52 
53  return c;
54 }
55 
56 /* Include kanji filter main part */
57 /* getchar and putchar will be replaced during inclusion */
58 
59 #define PERL_XS 1
60 #include "nkf-utf8/config.h"
61 #include "nkf-utf8/utf8tbl.c"
62 #include "nkf-utf8/nkf.c"
63 
/*
 * Resolve an encoding name to a Ruby encoding.
 *
 * NOTE(review): this listing is missing the function header (listing line 64)
 * and listing line 69. The cross-reference index at the end of this page
 * gives the header as `rb_encoding * rb_nkf_enc_get(const char *name)`.
 *
 * Visible behavior: `name` is looked up with rb_enc_find_index(). If that
 * fails, nkf_enc_find(name) is consulted, and if idx is still negative a
 * dummy encoding named `name` is registered with rb_define_dummy_encoding().
 * The result is rb_enc_from_index(idx).
 */
65 {
66  int idx = rb_enc_find_index(name);
67  if (idx < 0) {
68  nkf_encoding *nkf_enc = nkf_enc_find(name);
/* NOTE(review): listing line 69 is absent here; presumably it recomputed idx
 * from nkf_enc, which would explain the repeated idx < 0 test below.
 * Confirm against the upstream file. */
70  if (idx < 0) {
71  idx = rb_define_dummy_encoding(name);
72  }
73  }
74  return rb_enc_from_index(idx);
75 }
76 
77 int nkf_split_options(const char *arg)
78 {
79  int count = 0;
80  unsigned char option[256];
81  int i = 0, j = 0;
82  int is_escaped = FALSE;
83  int is_single_quoted = FALSE;
84  int is_double_quoted = FALSE;
85  for(i = 0; arg[i]; i++){
86  if(j == 255){
87  return -1;
88  }else if(is_single_quoted){
89  if(arg[i] == '\''){
90  is_single_quoted = FALSE;
91  }else{
92  option[j++] = arg[i];
93  }
94  }else if(is_escaped){
95  is_escaped = FALSE;
96  option[j++] = arg[i];
97  }else if(arg[i] == '\\'){
98  is_escaped = TRUE;
99  }else if(is_double_quoted){
100  if(arg[i] == '"'){
101  is_double_quoted = FALSE;
102  }else{
103  option[j++] = arg[i];
104  }
105  }else if(arg[i] == '\''){
106  is_single_quoted = TRUE;
107  }else if(arg[i] == '"'){
108  is_double_quoted = TRUE;
109  }else if(arg[i] == ' '){
110  option[j] = '\0';
111  options(option);
112  j = 0;
113  }else{
114  option[j++] = arg[i];
115  }
116  }
117  if(j){
118  option[j] = '\0';
119  options(option);
120  }
121  return count;
122 }
123 
124 /*
125  * call-seq:
126  * NKF.nkf(opt, str) => string
127  *
128  * Convert _str_ and return converted result.
129  * Conversion details are specified by _opt_ as String.
130  *
131  * require 'nkf'
132  * output = NKF.nkf("-s", input)
133  */
134 
/*
 * Implementation of NKF.nkf(opt, str).
 *
 * NOTE(review): this listing is missing several lines of the function
 * (listing lines 136, 141, 144-149, 151, 163, 166-167, 171 and 173). The
 * cross-reference index at the end of this page gives the header as
 * `static VALUE rb_nkf_convert(VALUE obj, VALUE opt, VALUE src)`. The
 * comments below describe only the lines that are visible.
 */
135 static VALUE
137 {
/* Second handle on the result string. NOTE(review): presumably kept so the
 * string stays referenced from the stack during conversion - confirm. */
138  volatile VALUE tmp;
139  reinit();
140  StringValue(opt);
/* NOTE(review): listing line 141 is absent; presumably it is where opt is
 * parsed, since output_encoding is checked immediately afterwards. */
142  if (!output_encoding) rb_raise(rb_eArgError, "no output encoding given");
143 
/* NOTE(review): listing lines 144-149 and 151 are absent; the closing brace
 * below belongs to a statement that is not visible here. */
150  }
152 
153  incsize = INCSIZE;
154 
/* Point the getc() replacement at the bytes of src. */
155  input_ctr = 0;
156  StringValue(src);
157  input = (unsigned char *)RSTRING_PTR(src);
158  i_len = RSTRING_LENINT(src);
/* Pre-size the result to three times the input length plus ten bytes.
 * NOTE(review): this is int arithmetic; consider whether i_len*3 can
 * overflow for very large inputs. */
159  tmp = result = rb_str_new(0, i_len*3 + 10);
160 
/* Point the putchar() replacement at the start of the result buffer. */
161  output_ctr = 0;
162  output = (unsigned char *)RSTRING_PTR(result);
/* NOTE(review): listing line 163 is absent; o_len is not set in any visible
 * line of this function. */
164  *output = '\0';
165 
/* NOTE(review): listing lines 166-167 are absent; presumably the conversion
 * itself runs here - confirm. */
168  OBJ_INFECT(result, src);
169 
/* NOTE(review): the bodies of both branches (listing lines 171 and 173) are
 * absent from this listing. */
170  if (mimeout_f)
172  else
174 
175  return result;
176 }
177 
178 
179 /*
180  * call-seq:
181  * NKF.guess(str) => encoding
182  *
183  * Returns guessed encoding of _str_ by nkf routine.
184  *
185  */
186 
/*
 * Implementation of NKF.guess(str).
 *
 * NOTE(review): this listing is missing the function header (listing line
 * 188) and listing line 201, where the return statement would be. The
 * cross-reference index at the end of this page gives the header as
 * `static VALUE rb_nkf_guess(VALUE obj, VALUE src)`.
 */
187 static VALUE
189 {
190  reinit();
191 
/* Point the getc() replacement at the bytes of src. */
192  input_ctr = 0;
193  StringValue(src);
194  input = (unsigned char *)RSTRING_PTR(src);
195  i_len = RSTRING_LENINT(src);
196 
/* Run kanji_convert() with guess_f set, then clear the flag again. */
197  guess_f = TRUE;
198  kanji_convert( NULL );
199  guess_f = FALSE;
200 
/* NOTE(review): listing line 201 is absent; no return value is visible. */
202 }
203 
204 
205 /*
206  * NKF - Ruby extension for Network Kanji Filter
207  *
208  * == Description
209  *
210  * This is a Ruby Extension version of nkf (Network Kanji Filter).
211  * It converts the second argument and returns the converted result. Conversion
212  * details are specified by flags given as the first argument.
213  *
214  * *Nkf* is yet another kanji code converter among networks, hosts and terminals.
215  * It converts input kanji code to designated kanji code
216  * such as ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8 or UTF-16.
217  *
218  * One of the most distinctive features of *nkf* is its guessing of the input kanji encoding.
219  * It currently recognizes ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8 and UTF-16.
220  * So users needn't set the input kanji code explicitly.
221  *
222  * By default, X0201 kana is converted into X0208 kana.
223  * For X0201 kana, SO/SI, SSO and ESC-(-I methods are supported.
224  * For automatic code detection, nkf assumes no X0201 kana in Shift_JIS.
225  * To accept X0201 in Shift_JIS, use <b>-X</b>, <b>-x</b> or <b>-S</b>.
226  *
227  * == Flags
228  *
229  * === -b -u
230  *
231  * Output is buffered (DEFAULT), Output is unbuffered.
232  *
233  * === -j -s -e -w -w16 -w32
234  *
235  * Output code is ISO-2022-JP (7bit JIS), Shift_JIS, EUC-JP,
236  * UTF-8N, UTF-16BE, UTF-32BE.
237  * Without this option and compile option, ISO-2022-JP is assumed.
238  *
239  * === -J -S -E -W -W16 -W32
240  *
241  * Input assumption is JIS 7 bit, Shift_JIS, EUC-JP,
242  * UTF-8, UTF-16, UTF-32.
243  *
244  * ==== -J
245  *
246  * Assume JIS input. It also accepts EUC-JP.
247  * This is the default. This flag does not exclude Shift_JIS.
248  *
249  * ==== -S
250  *
251  * Assume Shift_JIS and X0201 kana input. It also accepts JIS.
252  * EUC-JP is recognized as X0201 kana. Without <b>-x</b> flag,
253  * X0201 kana (halfwidth kana) is converted into X0208.
254  *
255  * ==== -E
256  *
257  * Assume EUC-JP input. It also accepts JIS.
258  * Same as -J.
259  *
260  * === -t
261  *
262  * No conversion.
263  *
264  * === -i_
265  *
266  * Output sequence to designate JIS-kanji. (DEFAULT B)
267  *
268  * === -o_
269  *
270  * Output sequence to designate ASCII. (DEFAULT B)
271  *
272  * === -r
273  *
274  * {de/en}crypt ROT13/47
275  *
276  * === -h[123] --hiragana --katakana --katakana-hiragana
277  *
278  * [-h1 --hiragana] Katakana to Hiragana conversion.
279  *
280  * [-h2 --katakana] Hiragana to Katakana conversion.
281  *
282  * [-h3 --katakana-hiragana] Katakana to Hiragana and Hiragana to Katakana conversion.
283  *
284  * === -T
285  *
286  * Text mode output (MS-DOS)
287  *
288  * === -l
289  *
290  * ISO8859-1 (Latin-1) support
291  *
292  * === -f[<code>m</code> [- <code>n</code>]]
293  *
294  * Folding on <code>m</code> length with <code>n</code> margin in a line.
295  * Without this option, fold length is 60 and fold margin is 10.
296  *
297  * === -F
298  *
299  * New line preserving line folding.
300  *
301  * === -Z[0-3]
302  *
303  * Convert X0208 alphabet (Fullwidth Alphabets) to ASCII.
304  *
305  * [-Z -Z0] Convert X0208 alphabet to ASCII.
306  *
307  * [-Z1] Converts X0208 kankaku to single ASCII space.
308  *
309  * [-Z2] Converts X0208 kankaku to double ASCII spaces.
310  *
311  * [-Z3] Replacing Fullwidth >, <, ", & into '&gt;', '&lt;', '&quot;', '&amp;' as in HTML.
312  *
313  * === -X -x
314  *
315  * Assume X0201 kana in MS-Kanji.
316  * With <b>-X</b> or without this option, X0201 is converted into X0208 Kana.
317  * With <b>-x</b>, try to preserve X0208 kana and do not convert X0201 kana to X0208.
318  * In JIS output, ESC-(-I is used. In EUC output, SSO is used.
319  *
320  * === -B[0-2]
321  *
322  * Assume broken JIS-Kanji input, which lost ESC.
323  * Useful when your site is using old B-News Nihongo patch.
324  *
325  * [-B1] allows any char after ESC-( or ESC-$.
326  *
327  * [-B2] forces ASCII after NL.
328  *
329  * === -I
330  *
331  * Replacing non iso-2022-jp char into a geta character
332  * (substitute character in Japanese).
333  *
334  * === -d -c
335  *
336  * Delete \r in line feed, Add \r in line feed.
337  *
338  * === -m[BQN0]
339  *
340  * MIME ISO-2022-JP/ISO8859-1 decode. (DEFAULT)
341  * To see ISO8859-1 (Latin-1) -l is necessary.
342  *
343  * [-mB] Decode MIME base64 encoded stream. Remove header or other part before
344  * conversion.
345  *
346  * [-mQ] Decode MIME quoted stream. '_' in quoted stream is converted to space.
347  *
348  * [-mN] Non-strict decoding.
349  * It allows line break in the middle of the base64 encoding.
350  *
351  * [-m0] No MIME decode.
352  *
353  * === -M
354  *
355  * MIME encode. Header style. All ASCII code and control characters are intact.
356  * Kanji conversion is performed before encoding, so this cannot be used as a picture encoder.
357  *
358  * [-MB] MIME encode Base64 stream.
359  *
360  * [-MQ] Perform quoted encoding.
361  *
362  * === -l
363  *
364  * Input and output code is ISO8859-1 (Latin-1) and ISO-2022-JP.
365  * <b>-s</b>, <b>-e</b> and <b>-x</b> are not compatible with this option.
366  *
367  * === -L[uwm]
368  *
369  * new line mode
370  * Without this option, nkf doesn't convert line breaks.
371  *
372  * [-Lu] unix (LF)
373  *
374  * [-Lw] windows (CRLF)
375  *
376  * [-Lm] mac (CR)
377  *
378  * === --fj --unix --mac --msdos --windows
379  *
380  * Convert for these systems.
381  *
382  * === --jis --euc --sjis --mime --base64
383  *
384  * convert for named code
385  *
386  * === --jis-input --euc-input --sjis-input --mime-input --base64-input
387  *
388  * assume input system
389  *
390  * === --ic=<code>input codeset</code> --oc=<code>output codeset</code>
391  *
392  * Set the input or output codeset.
393  * NKF supports the following codesets; codeset names are case insensitive.
394  *
395  * [ISO-2022-JP] a.k.a. RFC1468, 7bit JIS, JUNET
396  *
397  * [EUC-JP (eucJP-nkf)] a.k.a. AT&T JIS, Japanese EUC, UJIS
398  *
399  * [eucJP-ascii] a.k.a. x-eucjp-open-19970715-ascii
400  *
401  * [eucJP-ms] a.k.a. x-eucjp-open-19970715-ms
402  *
403  * [CP51932] Microsoft Version of EUC-JP.
404  *
405  * [Shift_JIS] SJIS, MS-Kanji
406  *
407  * [Windows-31J] a.k.a. CP932
408  *
409  * [UTF-8] same as UTF-8N
410  *
411  * [UTF-8N] UTF-8 without BOM
412  *
413  * [UTF-8-BOM] UTF-8 with BOM
414  *
415  * [UTF-16] same as UTF-16BE
416  *
417  * [UTF-16BE] UTF-16 Big Endian without BOM
418  *
419  * [UTF-16BE-BOM] UTF-16 Big Endian with BOM
420  *
421  * [UTF-16LE] UTF-16 Little Endian without BOM
422  *
423  * [UTF-16LE-BOM] UTF-16 Little Endian with BOM
424  *
425  * [UTF-32] same as UTF-32BE
426  *
427  * [UTF-32BE] UTF-32 Big Endian without BOM
428  *
429  * [UTF-32BE-BOM] UTF-32 Big Endian with BOM
430  *
431  * [UTF-32LE] UTF-32 Little Endian without BOM
432  *
433  * [UTF-32LE-BOM] UTF-32 Little Endian with BOM
434  *
435  * [UTF8-MAC] NKDed UTF-8, a.k.a. UTF8-NFD (input only)
436  *
437  * === --fb-{skip, html, xml, perl, java, subchar}
438  *
439  * Specify the way that nkf handles unassigned characters.
440  * Without this option, --fb-skip is assumed.
441  *
442  * === --prefix= <code>escape character</code> <code>target character</code> ..
443  *
444  * When nkf converts to Shift_JIS,
445  * nkf adds a specified escape character to specified 2nd byte of Shift_JIS characters.
446  * 1st byte of argument is the escape character and following bytes are target characters.
447  *
448  * === --no-cp932ext
449  *
450  * Handle the characters extended in CP932 as unassigned characters.
451  *
452  * === --no-best-fit-chars
453  *
454  * When converting from Unicode to an encoded byte sequence,
455  * don't convert characters that are not round-trip safe.
456  * When converting from Unicode to Unicode,
457  * with this and the -x option, nkf can be used as a UTF converter.
458  * (In other words, without this and the -x option, nkf doesn't preserve some characters.)
459  *
460  * When nkf converts strings related to paths, you should use this option.
461  *
462  * === --cap-input
463  *
464  * Decode hex encoded characters.
465  *
466  * === --url-input
467  *
468  * Unescape percent escaped characters.
469  *
470  * === --
471  *
472  * Ignore rest of -option.
473  */
474 
/*
 * Extension entry point: defines the NKF module, its module functions and
 * its constants.
 *
 * NOTE(review): this listing is missing the function header (listing line
 * 476) and listing line 492. The cross-reference index at the end of this
 * page gives the header as `void Init_nkf()`.
 */
475 void
477 {
478  VALUE mNKF = rb_define_module("NKF");
479 
/* NKF.nkf takes two arguments, NKF.guess takes one. */
480  rb_define_module_function(mNKF, "nkf", rb_nkf_convert, 2);
481  rb_define_module_function(mNKF, "guess", rb_nkf_guess, 1);
/* NOTE(review): the new name and the old name of this alias are both
 * "guess"; confirm whether this call is still needed. */
482  rb_define_alias(rb_singleton_class(mNKF), "guess", "guess");
483 
/* These three constants are all defined as nil. */
484  rb_define_const(mNKF, "AUTO", Qnil);
485  rb_define_const(mNKF, "NOCONV", Qnil);
486  rb_define_const(mNKF, "UNKNOWN", Qnil);
/* The remaining encoding constants hold the encoding object that
 * rb_nkf_enc_get() resolves for the given name. */
487  rb_define_const(mNKF, "BINARY", rb_enc_from_encoding(rb_nkf_enc_get("BINARY")));
488  rb_define_const(mNKF, "ASCII", rb_enc_from_encoding(rb_nkf_enc_get("US-ASCII")));
489  rb_define_const(mNKF, "JIS", rb_enc_from_encoding(rb_nkf_enc_get("ISO-2022-JP")));
490  rb_define_const(mNKF, "EUC", rb_enc_from_encoding(rb_nkf_enc_get("EUC-JP")));
491  rb_define_const(mNKF, "SJIS", rb_enc_from_encoding(rb_nkf_enc_get("Shift_JIS")));
/* NOTE(review): listing line 492 is absent from this listing. */
493  rb_define_const(mNKF, "UTF16", rb_enc_from_encoding(rb_nkf_enc_get("UTF-16BE")));
494  rb_define_const(mNKF, "UTF32", rb_enc_from_encoding(rb_nkf_enc_get("UTF-32BE")));
495 
496  /* Full version string of nkf */
497  rb_define_const(mNKF, "VERSION", rb_str_new2(RUBY_NKF_VERSION));
498  /* Version of nkf */
499  rb_define_const(mNKF, "NKF_VERSION", rb_str_new2(NKF_VERSION));
500  /* Release date of nkf */
501  rb_define_const(mNKF, "NKF_RELEASE_DATE", rb_str_new2(NKF_RELEASE_DATE));
502 }
static int incsize
Definition: nkf.c:38
#define FALSE
Definition: nkf.h:185
int idx
Definition: tcltklib.c:9703
static VALUE rb_nkf_convert(VALUE obj, VALUE opt, VALUE src)
Definition: nkf.c:136
rb_encoding * rb_nkf_enc_get(const char *name)
Definition: nkf.c:64
int count
Definition: encoding.c:50
#define NKF_RELEASE_DATE
Definition: nkf.c:24
static int rb_nkf_putchar(unsigned int c)
Definition: nkf.c:43
#define nkf_enc_name(enc)
Definition: nkf.c:734
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:102
unsigned long VALUE
Definition: ruby.h:88
Definition: nkf.c:115
#define RSTRING_PTR(string)
Definition: generator.h:42
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:1574
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:727
static VALUE rb_nkf_guess(VALUE obj, VALUE src)
Definition: nkf.c:188
static void reinit(void)
Definition: nkf.c:5176
static int kanji_convert(FILE *f)
Definition: nkf.c:5431
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1125
static int mimeout_f
Definition: nkf.c:401
#define RUBY_NKF_VERSION
Definition: nkf.c:11
static unsigned char * output
Definition: nkf.c:32
#define Qnil
Definition: ruby.h:367
VALUE rb_singleton_class(VALUE obj)
Returns the singleton class of obj.
Definition: class.c:1316
#define NKF_VERSION
Definition: nkf.c:23
n NULL
Definition: yaml2byte.c:134
static int output_ctr
Definition: nkf.c:36
unsigned int input
Definition: nkf.c:3916
static int output_bom_f
Definition: nkf.c:364
static VALUE VALUE obj
Definition: tcltklib.c:3147
void rb_define_const(VALUE, const char *, VALUE)
Definition: variable.c:1923
#define INCSIZE
Definition: nkf.c:24
void Init_nkf()
Definition: nkf.c:476
static const char * get_guessed_code(void)
Definition: nkf.c:4155
static int i_len
Definition: nkf.c:35
VALUE rb_str_resize(VALUE, long)
Definition: string.c:1779
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition: class.c:1385
void rb_define_module_function(VALUE module, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a module function for module.
Definition: class.c:1358
#define TRUE
Definition: nkf.h:186
#define nkf_enc_to_base_encoding(enc)
Definition: nkf.c:736
static int input_ctr
Definition: nkf.c:34
rb_encoding * rb_usascii_encoding(void)
Definition: encoding.c:1140
#define nkf_enc_to_index(enc)
Definition: nkf.c:735
int rb_define_dummy_encoding(const char *name)
Definition: encoding.c:375
static VALUE result
Definition: nkf.c:40
VpDivd * c
Definition: bigdecimal.c:1163
arg
Definition: ripper.y:1287
VALUE src
Definition: tcltklib.c:7940
static int o_len
Definition: nkf.c:37
static nkf_encoding * nkf_enc_find(const char *name)
Definition: nkf.c:726
#define rb_str_set_len(str, length)
Definition: ruby_missing.h:30
#define OBJ_INFECT(x, s)
Definition: ruby.h:967
Definition: nkf.c:110
Definition: nkf.c:113
options
Definition: tcltklib.c:4470
int rb_enc_find_index(const char *name)
Definition: encoding.c:596
#define RSTRING_LENINT(str)
Definition: ruby.h:684
Definition: nkf.c:108
ssize_t i
Definition: bigdecimal.c:5519
VALUE rb_define_module(const char *name)
Definition: class.c:587
int nkf_split_options(const char *arg)
Definition: nkf.c:77
static nkf_encoding * output_encoding
Definition: nkf.c:338
const char * name
Definition: nkf.c:208
Definition: nkf.c:118
VALUE rb_str_new2(const char *)
static int guess_f
Definition: nkf.c:447
VALUE rb_eArgError
Definition: error.c:468
#define StringValue(v)
Definition: ruby.h:466
static nkf_encoding * nkf_enc_from_index(int idx)
Definition: nkf.c:704
rb_encoding * rb_enc_from_index(int index)
Definition: encoding.c:512
Definition: nkf.c:120
VALUE rb_str_new(const char *, long)
Definition: string.c:410