1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31: #include <stdio.h>
32: #include <stdlib.h>
33: #include <string.h>
34:
35: #include "config.h"
36:
37: #include <anthy/anthy.h>
38:
39: #include <anthy/xstr.h>
40: #include <anthy/xchar.h>
41: #include "diclib_inner.h"
42:
43:
/* Encoding handed to anthy_sputxchar() by anthy_putxchar().  It has static
 * storage, so it starts out as 0; no assignment to it appears in the part
 * of the file visible here. */
44: static int print_encoding;
45:
/* Size of the scratch arrays that receive one encoded character plus its
 * terminating NUL in anthy_sputxstr(), anthy_snputxstr() and
 * anthy_putxchar(). */
46: #define MAX_BYTES_PER_XCHAR 10
47:
48: static int
49: xc_isprint(xchar xc)
50: {
51: return xc > 0;
52: }
53:
54:
55:
/*
 * Counts the characters of a NUL-terminated byte string in which a byte
 * with the high bit set starts a two-byte character and every other byte
 * is a character of its own.
 *
 * A high-bit byte in the very last position still counts as one
 * character, even though its second byte is missing.
 *
 * Returns the number of characters.
 */
static int
xlengthofcstr(const char *c)
{
  int nbytes = strlen(c);
  int count = 0;
  int pos = 0;

  while (pos < nbytes) {
    /* a lead byte is consumed together with the byte that follows it */
    pos += (c[pos] & 0x80) ? 2 : 1;
    count++;
  }
  return count;
}
70:
71: const char *
72: anthy_utf8_to_ucs4_xchar(const char *s, xchar *res)
73: {
74: const unsigned char *str = (const unsigned char *)s;
75: int i, len;
76: xchar cur;
77: cur = str[0];
78: if (str[0] < 0x80) {
79: len = 1;
80: } else if (str[0] < 0xe0) {
81: cur &= 0x1f;
82: len = 2;
83: } else if (str[0] < 0xf0) {
84: cur &= 0x0f;
85: len = 3;
86: } else if (str[0] < 0xf8) {
87: cur &= 0x07;
88: len = 4;
89: } else if (str[0] < 0xfc) {
90: cur &= 0x03;
91: len = 5;
92: } else {
93: cur &= 0x01;
94: len = 6;
95: }
96: str ++;
97: for (i = 1; i < len; i++) {
98: cur <<= 6;
99: cur |= (str[0] & 0x3f);
100: str++;
101: }
102: *res = cur;
103: return (const char *)str;
104: }
105:
106: static xstr *
107: utf8_to_ucs4_xstr(const char *s)
108: {
109: const unsigned char *str = (const unsigned char *)s;
110: xstr res;
111: res.str = (xchar *)alloca(sizeof(xchar) * strlen(s));
112: res.len = 0;
113:
114: while (*str) {
115: xchar cur;
116: str = (const unsigned char *)anthy_utf8_to_ucs4_xchar((const char *)str,
117: &cur);
118: res.str[res.len] = cur;
119: res.len ++;
120: }
121: return anthy_xstr_dup(&res);
122: }
123:
124: static int
125: put_xchar_to_utf8_str(xchar xc, char *buf_)
126: {
127: int i, len;
128: unsigned char *buf = (unsigned char *)buf_;
129: if (xc < 0x80) {
130: buf[0] = 0;
131: len = 1;
132: } else if (xc < 0x800) {
133: buf[0] = 0xc0;
134: len = 2;
135: } else if (xc < 0x10000) {
136: buf[0] = 0xe0;
137: len = 3;
138: } else if (xc < 0x200000) {
139: buf[0] = 0xf0;
140: len = 4;
141: } else if (xc < 0x400000) {
142: buf[0] = 0xf8;
143: len = 5;
144: } else {
145: buf[0] = 0xfc;
146: len = 6;
147: }
148: for (i = len - 1; i > 0; i--) {
149: buf[i] = (xc & 0x3f) | 0x80;
150: xc >>= 6;
151: }
152: buf[0] += xc;
153: buf[len] = 0;
154: return len;
155: }
156:
157: static char *
158: ucs4_xstr_to_utf8(xstr *xs)
159: {
160: char *buf = alloca(xs->len * 6 + 1);
161: int i, t = 0;
162: buf[0] = 0;
163: for (i = 0; i < xs->len; i++) {
164: xchar xc = xs->str[i];
165: put_xchar_to_utf8_str(xc, &buf[t]);
166: t = strlen(buf);
167: }
168: return strdup(buf);
169: }
170:
171:
172:
173: xstr *
174: anthy_cstr_to_xstr(const char *c, int encoding)
175: {
176: xstr *x;
177: int i, j, l;
178: if (encoding == ANTHY_UTF8_ENCODING) {
179: return utf8_to_ucs4_xstr(c);
180: }
181: l = xlengthofcstr(c);
182: x = (xstr *)malloc(sizeof(struct xstr_));
183: if (!x) {
184: return NULL;
185: }
186: x->len = l;
187: x->str = malloc(sizeof(xchar)*l);
188: for (i = 0, j = 0; i < l; i++) {
189: if (!(c[j] & 0x80)){
190: x->str[i] = c[j];
191: j++;
192: } else {
193: unsigned char *p = (unsigned char *)&c[j];
194: x->str[i] = (p[1] | (p[0]<<8)) | 0x8080;
195: x->str[i] = anthy_euc_to_ucs(x->str[i]);
196: j++;
197: j++;
198: }
199: }
200: return x;
201: }
202:
203: char *
204: anthy_xstr_to_cstr(xstr *s, int encoding)
205: {
206: int i, j, l;
207: char *p;
208:
209: if (encoding == ANTHY_UTF8_ENCODING) {
210: return ucs4_xstr_to_utf8(s);
211: }
212:
213: l = s->len;
214: for (i = 0; i < s->len; i++) {
215: int ec = anthy_ucs_to_euc(s->str[i]);
216: if (ec > 255) {
217: l++;
218: }
219: }
220: p = (char *)malloc(l + 1);
221: p[l] = 0;
222: j = 0;
223: for (i = 0; i < s->len; i++) {
224: int ec = anthy_ucs_to_euc(s->str[i]);
225: if (ec < 256) {
226: p[j] = ec;
227: j++;
228: }else{
229: p[j] = ec >> 8;
230: j++;
231: p[j] = ec & 255;
232: j++;
233: }
234: }
235: return p;
236: }
237:
238: xstr *
239: anthy_xstr_dup(xstr *s)
240: {
241: int i;
242: xstr *x = (xstr *)malloc(sizeof(xstr));
243: x->len = s->len;
244: if (s->len) {
245: x->str = malloc(sizeof(xchar)*s->len);
246: }else{
247: x->str = NULL;
248: }
249: for (i = 0; i < x->len; i++) {
250: x->str[i] = s->str[i];
251: }
252: return x;
253: }
254:
255: xchar *
256: anthy_xstr_dup_str(xstr *s)
257: {
258: xchar *c;
259: int i;
260: if (s->len) {
261: c = malloc(sizeof(xchar)*s->len);
262: }else{
263: c = 0;
264: }
265: for (i = 0; i < s->len; i++) {
266: c[i] = s->str[i];
267: }
268: return c;
269: }
270:
271: void
272: anthy_free_xstr(xstr *x)
273: {
274: if (!x) {
275: return ;
276: }
277:
278: free(x->str);
279: free(x);
280: }
281:
282: void
283: anthy_free_xstr_str(xstr *x)
284: {
285: if (!x) {
286: return ;
287: }
288: free(x->str);
289: }
290:
291: int
292: anthy_sputxchar(char *buf, xchar x, int encoding)
293: {
294: if (!xc_isprint(x)) {
295: sprintf(buf, "??");
296: return 2;
297: }
298: if (encoding == ANTHY_UTF8_ENCODING) {
299: return put_xchar_to_utf8_str(x, buf);
300: }
301: x = anthy_ucs_to_euc(x);
302: if (x < 256) {
303: buf[0] = x;
304: buf[1] = 0;
305: return 1;
306: }
307: buf[2] = 0;
308: buf[1] = 0x80 | (x & 255);
309: buf[0] = 0x80 | ((x>>8) & 255);
310: return 2;
311: }
312:
313: int
314: anthy_sputxstr(char *buf, xstr *x, int encoding)
315: {
316: char b[MAX_BYTES_PER_XCHAR];
317: int i, l = 0;
318: for (i = 0; i < x->len; i++) {
319: anthy_sputxchar(b, x->str[i], encoding);
320: sprintf(&buf[l], "%s", b);
321: l += strlen(b);
322: }
323: return l;
324: }
325:
326: int
327: anthy_snputxstr(char *buf, int n, xstr *x, int encoding)
328: {
329: char b[MAX_BYTES_PER_XCHAR];
330: int i, l=0;
331: for (i = 0; i < x->len; i++) {
332: anthy_sputxchar(b, x->str[i], encoding);
333: if ((int)strlen(b) + l >= n) {
334: return l;
335: }
336: n -= sprintf(&buf[l], "%s", b);
337: l += strlen(b);
338: }
339: return l;
340: }
341:
342: void
343: anthy_putxchar(xchar x)
344: {
345: char buf[MAX_BYTES_PER_XCHAR];
346: if (!xc_isprint(x)) {
347: printf("\\%x", x);
348: return ;
349: }
350: anthy_sputxchar(buf, x, print_encoding);
351: printf("%s", buf);
352: }
353:
354: void
355: anthy_putxstr(xstr *x)
356: {
357: int i;
358: for (i = 0; i < x->len; i++) {
359: anthy_putxchar(x->str[i]);
360: }
361: }
362:
363: void
364: anthy_putxstrln(xstr *x)
365: {
366: anthy_putxstr(x);
367: printf("\n");
368: }
369:
370: xstr*
371: anthy_xstrcpy(xstr *dest, xstr *src)
372: {
373: int i;
374:
375: dest->len = src->len;
376: for (i = 0; i < src->len; i++) {
377: dest->str[i] = src->str[i];
378: }
379:
380: return dest;
381: }
382:
383: int
384: anthy_xstrcmp(xstr *x1, xstr *x2)
385: {
386: int i, m;
387: if (x1->len < x2->len) {
388: m = x1->len;
389: }else{
390: m = x2->len;
391: }
392: for (i = 0 ; i < m ; i++) {
393: if (x1->str[i] < x2->str[i]) {
394: return -1;
395: }
396: if (x1->str[i] > x2->str[i]) {
397: return 1;
398: }
399: }
400: if (x1->len < x2->len) {
401: return -1;
402: }
403: if (x1->len > x2->len) {
404: return 1;
405: }
406: return 0;
407: }
408:
409:
410: int
411: anthy_xstrncmp(xstr *x1, xstr *x2, int n)
412: {
413: int i, m;
414: if (x1->len < x2->len) {
415: m = x1->len;
416: }else{
417: m = x2->len;
418: }
419: if (m > n) m = n;
420: for (i = 0 ; i < m ; i++) {
421: if (x1->str[i] < x2->str[i]) {
422: return -1;
423: }
424: if (x1->str[i] > x2->str[i]) {
425: return 1;
426: }
427: }
428: if (x2->len <= n && x1->len < x2->len) {
429: return -1;
430: }
431: if (x1->len <= n && x1->len > x2->len) {
432: return 1;
433: }
434: return 0;
435: }
436:
437:
438: xstr *
439: anthy_xstrcat(xstr *s, xstr *a)
440: {
441: int i, l;
442: if (!s) {
443: s = malloc(sizeof(xstr));
444: s->str = NULL;
445: s->len = 0;
446: }
447: l = s->len + a->len;
448: s->str = realloc(s->str, sizeof(xchar)*l);
449: for (i = 0; i < a->len; i ++) {
450: s->str[s->len+i] = a->str[i];
451: }
452: s->len = l;
453: return s;
454: }
455:
456: xstr *
457: anthy_xstrappend(xstr *xs, xchar xc)
458: {
459: xstr p;
460: xchar q[1];
461: p.len = 1;
462: p.str = q;
463: q[0] = xc;
464: return anthy_xstrcat(xs, &p);
465: }
466:
467: long long
468: anthy_xstrtoll(xstr *x)
469: {
470: xchar c;
471: int i;
472: long long n = 0;
473: if (!x->len || x->len > 16) {
474: return -1;
475: }
476: if (!anthy_get_xstr_type(x) & (XCT_NUM | XCT_WIDENUM)) {
477: return -1;
478: }
479: for (i = 0; i < x->len; i++) {
480: c = x->str[i];
481: n *= 10;
482: n += anthy_xchar_to_num(c);
483: }
484: return n;
485: }
486:
487:
488:
489: xstr *
490: anthy_xstr_wide_num_to_num(xstr* src_xs)
491: {
492: int i;
493: xstr *dst_xs;
494: dst_xs = anthy_xstr_dup(src_xs);
495: for (i = 0; i < src_xs->len; ++i) {
496: dst_xs->str[i] = anthy_xchar_wide_num_to_num(src_xs->str[i]);
497: }
498: return dst_xs;
499: }
500:
501:
502:
503: xstr *
504: anthy_xstr_hira_to_kata(xstr *src_xs)
505: {
506: xstr *dst_xs;
507: int i, j;
508: dst_xs = anthy_xstr_dup(src_xs);
509:
510: for (i = 0 ,j = 0; i < dst_xs->len; i++, j++) {
511:
512: if (i < dst_xs->len - 1 && dst_xs->str[i] == HK_U
513: && dst_xs->str[i+1] == HK_DDOT) {
514: dst_xs->str[j] = KK_VU;
515: i++;
516: continue ;
517: }
518:
519: dst_xs->str[j] = dst_xs->str[i];
520: if ((anthy_ucs_to_euc(dst_xs->str[j]) & 0xff00) == 0xa400) {
521:
522: dst_xs->str[j] = anthy_ucs_to_euc(dst_xs->str[j]);
523: dst_xs->str[j] += 256;
524: dst_xs->str[j] = anthy_euc_to_ucs(dst_xs->str[j]);
525: }
526: }
527: dst_xs->len = j;
528: return dst_xs;
529: }
530:
531: xstr *
532: anthy_xstr_hira_to_half_kata(xstr *src_xs)
533: {
534: int len = src_xs->len;
535: int i, j;
536: xstr *xs;
537: for (i = 0; i < src_xs->len; i++) {
538: const struct half_kana_table *tab = anthy_find_half_kana(src_xs->str[i]);
539: if (tab && tab->mod) {
540: len ++;
541: }
542: }
543: xs = malloc(sizeof(xstr));
544: xs->len = len;
545: xs->str = malloc(sizeof(xchar) * len);
546: j = 0;
547: for (i = 0; i < src_xs->len; i++) {
548: const struct half_kana_table *tab = anthy_find_half_kana(src_xs->str[i]);
549: if (tab) {
550: xs->str[j] = anthy_euc_to_ucs(tab->dst);
551: if (tab->mod) {
552: j++;
553: xs->str[j] = anthy_euc_to_ucs(tab->mod);
554: }
555: } else {
556: xs->str[j] = src_xs->str[i];
557: }
558: j++;
559: }
560: return xs;
561: }
562:
563: xstr *
564: anthy_conv_half_wide(xstr *xs)
565: {
566: int i;
567: xstr *res;
568: for (i = 0; i < xs->len; i++) {
569: if (!anthy_lookup_half_wide(xs->str[i])) {
570: return NULL;
571: }
572: }
573: res = anthy_xstr_dup(xs);
574: for (i = 0; i < xs->len; i++) {
575: res->str[i] = anthy_lookup_half_wide(xs->str[i]);
576: }
577: return res;
578: }
579:
580: int
581: anthy_xstr_hash(xstr *xs)
582: {
583: int h,i;
584: h = 0;
585: for (i = 0 ;i < xs->len ;i++) {
586: h *= 97;
587: h += xs->str[i]<<4;