1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25: #include <stdio.h>
26: #include <stdlib.h>
27: #include <stdbool.h>
28: #include <string.h>
29: #include <time.h>
30:
31:
/* Attributes recorded for one code point.  fill_attribute stores the 14
   attribute fields of an input line here, in order.  fill_attributes
   resets `name' to NULL for every entry before parsing, and the to_xxx /
   is_xxx helpers treat a NULL `name' as "no data for this code point".
   String members set by fill_attribute are either heap copies or the
   constant "" when the input field was empty.  */
struct unicode_attribute
{
  const char *name;           /* Field 1; NULL while the entry is unset.  */
  const char *category;       /* Field 2; the is_xxx helpers test its first
                                 two characters (e.g. "Nd", "Zs").  */
  const char *combining;      /* Field 3; parsed as a decimal number by
                                 is_combining_level3.  */
  const char *bidi;           /* Field 4.  */
  const char *decomposition;  /* Field 5; searched for "<noBreak>" by
                                 is_blank and is_space.  */
  const char *decdigit;       /* Field 6.  */
  const char *digit;          /* Field 7.  */
  const char *numeric;        /* Field 8.  */
  int mirrored;               /* Nonzero iff field 9 starts with 'Y'.  */
  const char *oldname;        /* Field 10.  */
  const char *comment;        /* Field 11.  */
  unsigned int upper;         /* Field 12 parsed as hex, or NONE if empty.  */
  unsigned int lower;         /* Field 13 parsed as hex, or NONE if empty.  */
  unsigned int title;         /* Field 14 parsed as hex, or NONE if empty.  */
};
49:
50:
51:
/* Marker (all bits set) stored in the upper/lower/title members when the
   corresponding input field is empty, i.e. no case mapping is recorded.  */
#define NONE (~(unsigned int)0)


/* One entry per code point, indexed directly by the code point value
   0 .. 0x10FFFF.  */
struct unicode_attribute unicode_attributes [0x110000];
56:
57:
58: static void
59: fill_attribute (unsigned int i,
60: const char *field1, const char *field2,
61: const char *field3, const char *field4,
62: const char *field5, const char *field6,
63: const char *field7, const char *field8,
64: const char *field9, const char *field10,
65: const char *field11, const char *field12,
66: const char *field13, const char *field14)
67: {
68: struct unicode_attribute * uni;
69:
70: if (i >= 0x110000)
71: {
72: fprintf (stderr, "index too large\n");
73: exit (1);
74: }
75: if (strcmp (field2, "Cs") == 0)
76:
77: return;
78: uni = &unicode_attributes[i];
79:
80: uni->name = strdup (field1);
81: uni->category = (field2[0] == '\0' ? "" : strdup (field2));
82: uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
83: uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
84: uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
85: uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
86: uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
87: uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
88: uni->mirrored = (field9[0] == 'Y');
89: uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
90: uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
91: uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
92: uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
93: uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
94: }
95:
96:
/* Size, terminator included, of every field buffer handed to getfield.  */
#define FIELDLEN 120

/* Read one field from STREAM into BUFFER, stopping at the first DELIM.

   Carriage returns are dropped.  BUFFER must have room for FIELDLEN
   bytes; a field that would not fit makes the program print a diagnostic
   and exit.

   Returns 1 when DELIM was found, with BUFFER NUL-terminated (DELIM itself
   is consumed but not stored).  Returns 0 when getc reported EOF first; in
   that case BUFFER is left without a terminator.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  char *out = buffer;
  int stored = 0;
  int c;

  while ((c = getc (stream)) != EOF && c != delim)
    {
      if (c == '\r')
        continue;

      stored++;
      if (stored >= FIELDLEN - 1)
        {
          fprintf (stderr, "field too long\n");
          exit (1);
        }
      *out++ = c;
    }

  if (c == EOF)
    return 0;

  *out = '\0';
  return 1;
}
130:
131:
132:
133: static void
134: fill_attributes (const char *unicodedata_filename)
135: {
136: unsigned int i, j;
137: FILE *stream;
138: char field0[FIELDLEN];
139: char field1[FIELDLEN];
140: char field2[FIELDLEN];
141: char field3[FIELDLEN];
142: char field4[FIELDLEN];
143: char field5[FIELDLEN];
144: char field6[FIELDLEN];
145: char field7[FIELDLEN];
146: char field8[FIELDLEN];
147: char field9[FIELDLEN];
148: char field10[FIELDLEN];
149: char field11[FIELDLEN];
150: char field12[FIELDLEN];
151: char field13[FIELDLEN];
152: char field14[FIELDLEN];
153: int lineno = 0;
154:
155: for (i = 0; i < 0x110000; i++)
156: unicode_attributes[i].name = NULL;
157:
158: stream = fopen (unicodedata_filename, "r");
159: if (stream == NULL)
160: {
161: fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
162: exit (1);
163: }
164:
165: for (;;)
166: {
167: int n;
168:
169: lineno++;
170: n = getfield (stream, field0, ';');
171: n += getfield (stream, field1, ';');
172: n += getfield (stream, field2, ';');
173: n += getfield (stream, field3, ';');
174: n += getfield (stream, field4, ';');
175: n += getfield (stream, field5, ';');
176: n += getfield (stream, field6, ';');
177: n += getfield (stream, field7, ';');
178: n += getfield (stream, field8, ';');
179: n += getfield (stream, field9, ';');
180: n += getfield (stream, field10, ';');
181: n += getfield (stream, field11, ';');
182: n += getfield (stream, field12, ';');
183: n += getfield (stream, field13, ';');
184: n += getfield (stream, field14, '\n');
185: if (n == 0)
186: break;
187: if (n != 15)
188: {
189: fprintf (stderr, "short line in'%s':%d\n",
190: unicodedata_filename, lineno);
191: exit (1);
192: }
193: i = strtoul (field0, NULL, 16);
194: if (field1[0] == '<'
195: && strlen (field1) >= 9
196: && !strcmp (field1 + strlen(field1) - 8, ", First>"))
197: {
198:
199: lineno++;
200: n = getfield (stream, field0, ';');
201: n += getfield (stream, field1, ';');
202: n += getfield (stream, field2, ';');
203: n += getfield (stream, field3, ';');
204: n += getfield (stream, field4, ';');
205: n += getfield (stream, field5, ';');
206: n += getfield (stream, field6, ';');
207: n += getfield (stream, field7, ';');
208: n += getfield (stream, field8, ';');
209: n += getfield (stream, field9, ';');
210: n += getfield (stream, field10, ';');
211: n += getfield (stream, field11, ';');
212: n += getfield (stream, field12, ';');
213: n += getfield (stream, field13, ';');
214: n += getfield (stream, field14, '\n');
215: if (n != 15)
216: {
217: fprintf (stderr, "missing end range in '%s':%d\n",
218: unicodedata_filename, lineno);
219: exit (1);
220: }
221: if (!(field1[0] == '<'
222: && strlen (field1) >= 8
223: && !strcmp (field1 + strlen (field1) - 7, ", Last>")))
224: {
225: fprintf (stderr, "missing end range in '%s':%d\n",
226: unicodedata_filename, lineno);
227: exit (1);
228: }
229: field1[strlen (field1) - 7] = '\0';
230: j = strtoul (field0, NULL, 16);
231: for (; i <= j; i++)
232: fill_attribute (i, field1+1, field2, field3, field4, field5,
233: field6, field7, field8, field9, field10,
234: field11, field12, field13, field14);
235: }
236: else
237: {
238:
239: fill_attribute (i, field1, field2, field3, field4, field5,
240: field6, field7, field8, field9, field10,
241: field11, field12, field13, field14);
242: }
243: }
244: if (ferror (stream) || fclose (stream))
245: {
246: fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
247: exit (1);
248: }
249: }
250:
251:
252:
253: static unsigned int
254: to_upper (unsigned int ch)
255: {
256: if (unicode_attributes[ch].name != NULL
257: && unicode_attributes[ch].upper != NONE)
258: return unicode_attributes[ch].upper;
259: else
260: return ch;
261: }
262:
263: static unsigned int
264: to_lower (unsigned int ch)
265: {
266: if (unicode_attributes[ch].name != NULL
267: && unicode_attributes[ch].lower != NONE)
268: return unicode_attributes[ch].lower;
269: else
270: return ch;
271: }
272:
273: static unsigned int
274: to_title (unsigned int ch)
275: {
276: if (unicode_attributes[ch].name != NULL
277: && unicode_attributes[ch].title != NONE)
278: return unicode_attributes[ch].title;
279: else
280: return ch;
281: }
282:
283:
284:
/* A character counts as uppercase exactly when lowercasing changes it.  */
static bool
is_upper (unsigned int ch)
{
  unsigned int lowered = to_lower (ch);

  return lowered != ch;
}
290:
/* A character counts as lowercase when uppercasing changes it.  U+00DF is
   additionally always reported as lowercase, whatever its uppercase
   mapping is.  */
static bool
is_lower (unsigned int ch)
{
  if (ch == 0x00DF)
    return true;

  return to_upper (ch) != ch;
}
298:
299: static bool
300: is_alpha (unsigned int ch)
301: {
302: return (unicode_attributes[ch].name != NULL
303: && ((unicode_attributes[ch].category[0] == 'L'
304:
305:
306: && (ch != 0x0E2F) && (ch != 0x0E46))
307:
308:
309: || (ch == 0x0E31)
310: || (ch >= 0x0E34 && ch <= 0x0E3A)
311: || (ch >= 0x0E47 && ch <= 0x0E4E)
312:
313: || (ch == 0x0345)
314:
315: || (unicode_attributes[ch].category[0] == 'N'
316: && unicode_attributes[ch].category[1] == 'l')
317:
318: || (unicode_attributes[ch].category[0] == 'S'
319: && unicode_attributes[ch].category[1] == 'o'
320: && strstr (unicode_attributes[ch].name, " LETTER ")
321: != NULL)
322:
323:
324:
325: || (unicode_attributes[ch].category[0] == 'N'
326: && unicode_attributes[ch].category[1] == 'd'
327: && !(ch >= 0x0030 && ch <= 0x0039))));
328: }
329:
/* Return true if CH is one of U+0030..U+0039.  The disabled branch keeps
   the alternative definition based on category Nd; with the active
   definition, Nd characters outside that range are reported by is_alpha
   instead.  */
static bool
is_digit (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'N'
          && unicode_attributes[ch].category[1] == 'd');
#else
  if (ch < 0x0030)
    return false;
  return ch <= 0x0039;
#endif
}
351:
/* Return true if CH is one of U+0030..U+0039.  */
static bool
is_outdigit (unsigned int ch)
{
  return !(ch < 0x0030 || ch > 0x0039);
}
357:
358: static bool
359: is_blank (unsigned int ch)
360: {
361: return (ch == 0x0009
362:
363: || (unicode_attributes[ch].name != NULL
364: && unicode_attributes[ch].category[0] == 'Z'
365: && unicode_attributes[ch].category[1] == 's'
366: && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
367: }
368:
369: static bool
370: is_space (unsigned int ch)
371: {
372:
373:
374: return (ch == 0x0020
375: || ch == 0x000C
376: || ch == 0x000A
377: || ch == 0x000D
378: || ch == 0x0009
379: || ch == 0x000B
380:
381: || (unicode_attributes[ch].name != NULL
382: && unicode_attributes[ch].category[0] == 'Z'
383: && (unicode_attributes[ch].category[1] == 'l'
384: || unicode_attributes[ch].category[1] == 'p'
385: || (unicode_attributes[ch].category[1] == 's'
386: && !strstr (unicode_attributes[ch].decomposition,
387: "<noBreak>")))));
388: }
389:
390: static bool
391: is_cntrl (unsigned int ch)
392: {
393: return (unicode_attributes[ch].name != NULL
394: && (!strcmp (unicode_attributes[ch].name, "<control>")
395:
396: || (unicode_attributes[ch].category[0] == 'Z'
397: && (unicode_attributes[ch].category[1] == 'l'
398: || unicode_attributes[ch].category[1] == 'p'))));
399: }
400:
/* Return true if CH is in U+0030..U+0039, U+0041..U+0046 or
   U+0061..U+0066.  The disabled branch keeps the alternative definition
   written in terms of is_digit.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  return is_digit (ch)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#else
  if (ch >= 0x0030 && ch <= 0x0039)
    return true;
  if (ch >= 0x0041 && ch <= 0x0046)
    return true;
  return ch >= 0x0061 && ch <= 0x0066;
#endif
}
422:
423: static bool
424: is_graph (unsigned int ch)
425: {
426: return (unicode_attributes[ch].name != NULL
427: && strcmp (unicode_attributes[ch].name, "<control>")
428: && !is_space (ch));
429: }
430:
431: static bool
432: is_print (unsigned int ch)
433: {
434: return (unicode_attributes[ch].name != NULL
435: && strcmp (unicode_attributes[ch].name, "<control>")
436:
437: && !(unicode_attributes[ch].name != NULL
438: && unicode_attributes[ch].category[0] == 'Z'
439: && (unicode_attributes[ch].category[1] == 'l'
440: || unicode_attributes[ch].category[1] == 'p')));
441: }
442:
/* Return true if CH is graphic (is_graph) but neither alphabetic
   (is_alpha) nor a digit (is_digit).  The disabled branch keeps the
   alternative definition based on categories starting with 'P'.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'P');
#else
  if (!is_graph (ch))
    return false;
  if (is_alpha (ch))
    return false;
  return !is_digit (ch);
#endif
}
455:
456: static bool
457: is_combining (unsigned int ch)
458: {
459:
460:
461:
462:
463: return (unicode_attributes[ch].name != NULL
464: && unicode_attributes[ch].category[0] == 'M'
465: && (unicode_attributes[ch].category[1] == 'n'
466: || unicode_attributes[ch].category[1] == 'c'
467: || unicode_attributes[ch].category[1] == 'e'));
468: }
469:
470: static bool
471: is_combining_level3 (unsigned int ch)
472: {
473: return is_combining (ch)
474: && !(unicode_attributes[ch].combining[0] != '\0'
475: && unicode_attributes[ch].combining[0] != '0'
476: && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
477: }
478:
479:
/* Format code point I as "<UXXXX>" (at least four uppercase hex digits)
   when it is below 0x10000, and as "<UXXXXXXXX>" (eight digits) otherwise.
   The result lives in a static buffer that the next call overwrites.  */
static const char *
ucs_symbol (unsigned int i)
{
  /* Longest output is "<U" + 8 hex digits + ">", i.e. 11 characters.  */
  static char buf[11+1];

  if (i < 0x10000)
    snprintf (buf, sizeof buf, "<U%04X>", i);
  else
    snprintf (buf, sizeof buf, "<U%08X>", i);
  return buf;
}
488:
489:
/* Format the range LOW..HIGH as the ucs_symbol of LOW, "..", and the
   ucs_symbol of HIGH.  The result lives in a static buffer that the next
   call overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  /* Two symbols of at most 11 characters each plus the two dots.  */
  static char buf[24+1];
  size_t used;

  /* ucs_symbol hands back one shared static buffer, so the first symbol
     has to be copied out before the second one is generated.  */
  strcpy (buf, ucs_symbol (low));
  used = strlen (buf);
  memcpy (buf + used, "..", 2);
  strcpy (buf + used + 2, ucs_symbol (high));
  return buf;
}
500:
501:
502:
503: static void
504: output_charclass (FILE *stream, const char *classname,
505: bool (*func) (unsigned int))
506: {
507: char table[0x110000];
508: unsigned int i;