4006 Certain printable unicode characters misclassified as nonprintable
5227 space should be automatically included in 'print' class by localedef

   1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2010,2011 Nexenta Systems, Inc.  All rights reserved.
  14  * Copyright 2012 Garrett D'Amore <garrett@damore.org>
  15  * Copyright 2013 DEY Storage Systems, Inc.
  16  */
  17 
  18 /*
  19  * LC_CTYPE database generation routines for localedef.
  20  */
  21 
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <sys/types.h>
  26 #include <sys/avl.h>
  27 #include <wchar.h>
  28 #include <ctype.h>
  29 #include <wctype.h>
  30 #include <unistd.h>
  31 #include "_ctype.h"
  32 #include "localedef.h"
  33 #include "parser.tab.h"
  34 #include "runefile.h"
  35 
  36 static avl_tree_t       ctypes;
  37 
  38 static wchar_t          last_ctype;
  39 
  40 typedef struct ctype_node {
  41         wchar_t wc;
  42         int32_t ctype;
  43         int32_t toupper;
  44         int32_t tolower;
  45         avl_node_t avl;
  46 } ctype_node_t;
  47 
  48 typedef struct width_node {
  49         wchar_t start;
  50         wchar_t end;
  51         int8_t width;
  52         avl_node_t avl;
  53 } width_node_t;
  54 
  55 static int
  56 ctype_compare(const void *n1, const void *n2)
  57 {
  58         const ctype_node_t *c1 = n1;
  59         const ctype_node_t *c2 = n2;
  60 
  61         return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
  62 }
  63 
  64 void
  65 init_ctype(void)
  66 {
  67         avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t),
  68             offsetof(ctype_node_t, avl));
  69 }
  70 
  71 
  72 static void
  73 add_ctype_impl(ctype_node_t *ctn)
  74 {
  75         switch (last_kw) {
  76         case T_ISUPPER:
  77                 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
  78                 break;
  79         case T_ISLOWER:
  80                 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
  81                 break;
  82         case T_ISALPHA:
  83                 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
  84                 break;
  85         case T_ISDIGIT:
  86                 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT);
  87                 break;
  88         case T_ISSPACE:
  89                 ctn->ctype |= _ISSPACE;
  90                 break;
  91         case T_ISCNTRL:
  92                 ctn->ctype |= _ISCNTRL;
  93                 break;
  94         case T_ISGRAPH:
  95                 ctn->ctype |= (_ISGRAPH | _ISPRINT);
  96                 break;
  97         case T_ISPRINT:
  98                 ctn->ctype |= _ISPRINT;
  99                 break;
 100         case T_ISPUNCT:
 101                 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
 102                 break;
 103         case T_ISXDIGIT:
 104                 ctn->ctype |= (_ISXDIGIT | _ISPRINT);
 105                 break;
 106         case T_ISBLANK:
 107                 ctn->ctype |= (_ISBLANK | _ISSPACE);
 108                 break;
 109         case T_ISPHONOGRAM:
 110                 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
 111                 break;
 112         case T_ISIDEOGRAM:
 113                 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
 114                 break;
 115         case T_ISENGLISH:
 116                 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
 117                 break;
 118         case T_ISNUMBER:
 119                 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
 120                 break;
 121         case T_ISSPECIAL:
 122                 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
 123                 break;
 124         case T_ISALNUM:
 125                 /*
 126                  * We can't do anything with this.  The character
 127                  * should already be specified as a digit or alpha.
 128                  */
 129                 break;
 130         default:
 131                 errf(_("not a valid character class"));
 132         }
 133 }
 134 
 135 static ctype_node_t *
 136 get_ctype(wchar_t wc)
 137 {
 138         ctype_node_t    srch;
 139         ctype_node_t    *ctn;
 140         avl_index_t     where;
 141 
 142         srch.wc = wc;
 143         if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) {
 144                 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
 145                         errf(_("out of memory"));
 146                         return (NULL);
 147                 }
 148                 ctn->wc = wc;
 149 
 150                 avl_insert(&ctypes, ctn, where);
 151         }
 152         return (ctn);
 153 }
 154 
 155 void
 156 add_ctype(int val)
 157 {
 158         ctype_node_t    *ctn;
 159 
 160         if ((ctn = get_ctype(val)) == NULL) {
 161                 INTERR;
 162                 return;
 163         }
 164         add_ctype_impl(ctn);
 165         last_ctype = ctn->wc;
 166 }
 167 
 168 void
 169 add_ctype_range(int end)
 170 {
 171         ctype_node_t    *ctn;
 172         wchar_t         cur;
 173 
 174         if (end < last_ctype) {
 175                 errf(_("malformed character range (%u ... %u))"),
 176                     last_ctype, end);
 177                 return;
 178         }
 179         for (cur = last_ctype + 1; cur <= end; cur++) {
 180                 if ((ctn = get_ctype(cur)) == NULL) {
 181                         INTERR;
 182                         return;
 183                 }
 184                 add_ctype_impl(ctn);
 185         }
 186         last_ctype = end;
 187 
 188 }
 189 
 190 /*
 191  * A word about widths: if the width mask is specified, then libc
 192  * unconditionally honors it.  Otherwise, it assumes printable
 193  * characters have width 1, and non-printable characters have width
 194  * -1 (except for NULL which is special with with 0).  Hence, we have
 195  * no need to inject defaults here -- the "default" unset value of 0
 196  * indicates that libc should use its own logic in wcwidth as described.
 197  */
 198 void
 199 add_width(int wc, int width)
 200 {
 201         ctype_node_t    *ctn;
 202 
 203         if ((ctn = get_ctype(wc)) == NULL) {
 204                 INTERR;
 205                 return;
 206         }
 207         ctn->ctype &= ~(_CTYPE_SWM);
 208         switch (width) {
 209         case 0:
 210                 ctn->ctype |= _CTYPE_SW0;
 211                 break;
 212         case 1:
 213                 ctn->ctype |= _CTYPE_SW1;
 214                 break;
 215         case 2:
 216                 ctn->ctype |= _CTYPE_SW2;
 217                 break;
 218         case 3:
 219                 ctn->ctype |= _CTYPE_SW3;
 220                 break;
 221         }
 222 }
 223 
 224 void
 225 add_width_range(int start, int end, int width)
 226 {
 227         for (; start <= end; start++) {
 228                 add_width(start, width);
 229         }
 230 }
 231 
 232 void
 233 add_caseconv(int val, int wc)
 234 {
 235         ctype_node_t    *ctn;
 236 
 237         ctn = get_ctype(val);
 238         if (ctn == NULL) {
 239                 INTERR;
 240                 return;
 241         }
 242 
 243         switch (last_kw) {
 244         case T_TOUPPER:
 245                 ctn->toupper = wc;
 246                 break;
 247         case T_TOLOWER:
 248                 ctn->tolower = wc;
 249                 break;
 250         default:
 251                 INTERR;
 252                 break;
 253         }
 254 }
 255 
 256 void
 257 dump_ctype(void)
 258 {
 259         FILE            *f;
 260         _FileRuneLocale rl;
 261         ctype_node_t    *ctn, *last_ct, *last_lo, *last_up;
 262         _FileRuneEntry  *ct = NULL;
 263         _FileRuneEntry  *lo = NULL;
 264         _FileRuneEntry  *up = NULL;
 265         wchar_t         wc;
 266 
 267         (void) memset(&rl, 0, sizeof (rl));
 268         last_ct = NULL;
 269         last_lo = NULL;
 270         last_up = NULL;
 271 
 272         if ((f = open_category()) == NULL)
 273                 return;
 274 
 275         (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
 276         (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
 277 
 278         /*
 279          * Initialize the identity map.
 280          */
 281         for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
 282                 rl.maplower[wc] = wc;
 283                 rl.mapupper[wc] = wc;
 284         }
 285 
 286         for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) {
 287                 int conflict = 0;
 288 
 289 
 290                 wc = ctn->wc;
 291 
 292                 /*
 293                  * POSIX requires certain portable characters have
 294                  * certain types.  Add them if they are missing.
 295                  */
 296                 if ((wc >= 1) && (wc <= 127)) {
 297                         if ((wc >= 'A') && (wc <= 'Z'))
 298                                 ctn->ctype |= _ISUPPER;
 299                         if ((wc >= 'a') && (wc <= 'z'))
 300                                 ctn->ctype |= _ISLOWER;
 301                         if ((wc >= '0') && (wc <= '9'))
 302                                 ctn->ctype |= _ISDIGIT;


 303                         if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
 304                                 ctn->ctype |= _ISSPACE;
 305                         if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
 306                                 ctn->ctype |= _ISXDIGIT;
 307                         if (strchr(" \t", (char)wc))
 308                                 ctn->ctype |= _ISBLANK;
 309 
 310                         /*
 311                          * Technically these settings are only
 312                          * required for the C locale.  However, it
 313                          * turns out that because of the historical
 314                          * version of isprint(), we need them for all
 315                          * locales as well.  Note that these are not
 316                          * necessarily valid punctation characters in
 317                          * the current language, but ispunct() needs
 318                          * to return TRUE for them.
 319                          */
 320                         if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
 321                             (char)wc))
 322                                 ctn->ctype |= _ISPUNCT;
 323                 }
 324 
 325                 /*
 326                  * POSIX also requires that certain types imply
 327                  * others.  Add any inferred types here.
 328                  */
 329                 if (ctn->ctype & (_ISUPPER |_ISLOWER))
 330                         ctn->ctype |= _ISALPHA;
 331                 if (ctn->ctype & _ISDIGIT)
 332                         ctn->ctype |= _ISXDIGIT;
 333                 if (ctn->ctype & _ISBLANK)
 334                         ctn->ctype |= _ISSPACE;
 335                 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
 336                         ctn->ctype |= _ISGRAPH;
 337                 if (ctn->ctype & _ISGRAPH)
 338                         ctn->ctype |= _ISPRINT;
 339 
 340                 /*
 341                  * Finally, POSIX requires that certain combinations
 342                  * are invalid.  We don't flag this as a fatal error,
 343                  * but we will warn about.
 344                  */
 345                 if ((ctn->ctype & _ISALPHA) &&
 346                     (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
 347                         conflict++;
 348                 if ((ctn->ctype & _ISPUNCT) &
 349                     (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
 350                         conflict++;
 351                 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
 352                         conflict++;
 353                 if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
 354                         conflict++;
 355                 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
 356                         conflict++;
 357 
 358                 if (conflict) {
 359                         warn("conflicting classes for character 0x%x (%x)",
 360                             wc, ctn->ctype);
 361                 }
 362                 /*
 363                  * Handle the lower 256 characters using the simple
 364                  * optimization.  Note that if we have not defined the
 365                  * upper/lower case, then we identity map it.
 366                  */
 367                 if ((unsigned)wc < _CACHED_RUNES) {
 368                         rl.runetype[wc] = ctn->ctype;
 369                         if (ctn->tolower)
 370                                 rl.maplower[wc] = ctn->tolower;
 371                         if (ctn->toupper)
 372                                 rl.mapupper[wc] = ctn->toupper;
 373                         continue;
 374                 }
 375 
 376                 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) {
 377                         ct[rl.runetype_ext_nranges-1].max = wc;
 378                         last_ct = ctn;
 379                 } else {
 380                         rl.runetype_ext_nranges++;
 381                         ct = realloc(ct,
 382                             sizeof (*ct) * rl.runetype_ext_nranges);
 383                         ct[rl.runetype_ext_nranges - 1].min = wc;
 384                         ct[rl.runetype_ext_nranges - 1].max = wc;
 385                         ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
 386                         last_ct = ctn;
 387                 }
 388                 if (ctn->tolower == 0) {
 389                         last_lo = NULL;
 390                 } else if ((last_lo != NULL) &&
 391                     (last_lo->tolower + 1 == ctn->tolower)) {
 392                         lo[rl.maplower_ext_nranges-1].max = wc;
 393                         last_lo = ctn;
 394                 } else {
 395                         rl.maplower_ext_nranges++;
 396                         lo = realloc(lo,
 397                             sizeof (*lo) * rl.maplower_ext_nranges);
 398                         lo[rl.maplower_ext_nranges - 1].min = wc;
 399                         lo[rl.maplower_ext_nranges - 1].max = wc;
 400                         lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
 401                         last_lo = ctn;
 402                 }
 403 
 404                 if (ctn->toupper == 0) {
 405                         last_up = NULL;
 406                 } else if ((last_up != NULL) &&
 407                     (last_up->toupper + 1 == ctn->toupper)) {
 408                         up[rl.mapupper_ext_nranges-1].max = wc;
 409                         last_up = ctn;
 410                 } else {
 411                         rl.mapupper_ext_nranges++;
 412                         up = realloc(up,
 413                             sizeof (*up) * rl.mapupper_ext_nranges);
 414                         up[rl.mapupper_ext_nranges - 1].min = wc;
 415                         up[rl.mapupper_ext_nranges - 1].max = wc;
 416                         up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
 417                         last_up = ctn;
 418                 }
 419         }
 420 
 421         if ((wr_category(&rl, sizeof (rl), f) < 0) ||
 422             (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
 423             (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
 424             (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
 425                 return;
 426         }
 427 
 428         close_category(f);
 429 }
--- EOF ---