比这篇新的文章: Codee#2522
比这篇旧的文章: 一些工具模板函数

unicode各种编码格式之间的转换(utf8,utf16, utf32)

语言: C++, 标签: utf unicode 2009/06/28发布 8个月前更新
作者: doorfly, 点击812次, 评论(0), 收藏者(1), , 打分:

背景
主题: 字体:
001 //
002 //convertUTF.h
003 //
004
005
006 /*
007 * Copyright 2001-2004 Unicode, Inc.
008 *
009 * Disclaimer
010 *
011 */
012
013 /* ---------------------------------------------------------------------
014
015     Conversions between UTF32, UTF-16, and UTF-8.  Header file.
016
017     Several functions are included here, forming a complete set of
018     conversions between the three formats.  UTF-7 is not included
019     here, but is handled in a separate source file.
020
021     Each of these routines takes pointers to input buffers and output
022     buffers.  The input buffers are const.
023
024     Each routine converts the text between *sourceStart and sourceEnd,
025     putting the result into the buffer between *targetStart and
026     targetEnd. Note: the end pointers are *after* the last item: e.g.
027     *(sourceEnd - 1) is the last item.
028
029     The return result indicates whether the conversion was successful,
030     and if not, whether the problem was in the source or target buffers.
031     (Only the first encountered problem is indicated.)
032
033     After the conversion, *sourceStart and *targetStart are both
034     updated to point to the end of last text successfully converted in
035     the respective buffers.
036
037     Input parameters:
038         sourceStart - pointer to a pointer to the source buffer.
039                 The contents of this are modified on return so that
040                 it points at the next thing to be converted.
041         targetStart - similarly, pointer to pointer to the target buffer.
042         sourceEnd, targetEnd - respectively pointers to the ends of the
043                 two buffers, for overflow checking only.
044
045     These conversion functions take a ConversionFlags argument. When this
046     flag is set to strict, both irregular sequences and isolated surrogates
047     will cause an error.  When the flag is set to lenient, both irregular
048     sequences and isolated surrogates are converted.
049
050     Whether the flag is strict or lenient, all illegal sequences will cause
051     an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
052     or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
053     must check for illegal sequences.
054
055     When the flag is set to lenient, characters over 0x10FFFF are converted
056     to the replacement character; otherwise (when the flag is set to strict)
057     they constitute an error.
058
059     Output parameters:
060         The value "sourceIllegal" is returned from some routines if the input
061         sequence is malformed.  When "sourceIllegal" is returned, the source
062         value will point to the illegal value that caused the problem. E.g.,
063         in UTF-8 when a sequence is malformed, it points to the start of the
064         malformed sequence. 
065
066     Author: Mark E. Davis, 1994.
067     Rev History: Rick McGowan, fixes & updates May 2001.
068                  Fixes & updates, Sept 2001.
069
070 ------------------------------------------------------------------------ */
071
072 /* ---------------------------------------------------------------------
073     The following 4 definitions are compiler-specific.
074     The C standard does not guarantee that wchar_t has at least
075     16 bits, so wchar_t is no less portable than unsigned short!
076     All should be unsigned values to avoid sign extension during
077     bit mask & shift operations.
078 ------------------------------------------------------------------------ */
079
/*
 * Code-unit types used by every converter. All of them are unsigned so that
 * mask and shift operations never sign-extend.
 */
typedef unsigned long UTF32;  /* at least 32 bits */
typedef unsigned short UTF16;  /* at least 16 bits */
typedef unsigned char UTF8;   /* typically 8 bits */
typedef unsigned char Boolean; /* 0 or 1 */

/*
 * Some fundamental constants.
 * Each expansion is fully parenthesized so the macro acts as one operand in
 * any context (for example "sizeof UNI_MAX_BMP").
 */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD)
#define UNI_MAX_BMP ((UTF32)0x0000FFFF)
#define UNI_MAX_UTF16 ((UTF32)0x0010FFFF)
#define UNI_MAX_UTF32 ((UTF32)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((UTF32)0x0010FFFF)
091
/* Status code returned by each Convert* routine. */
typedef enum
{
    conversionOK = 0,       /* conversion successful */
    sourceExhausted = 1,    /* partial character in source, but hit end */
    targetExhausted = 2,    /* insuff. room in target for conversion */
    sourceIllegal = 3       /* source sequence is illegal/malformed */
} ConversionResult;
102
/* Selects how the converters treat irregular input (strict vs. lenient). */
typedef enum
{
    strictConversion = 0,
    lenientConversion = 1
} ConversionFlags;
108
109 /* This is for C++ and does no harm in C */
110 #ifdef __cplusplus
111 extern "C"
112 {
113 #endif
114
115     ConversionResult ConvertUTF8toUTF16(const UTF8** sourceStart,
116         const UTF8* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
117         ConversionFlags flags);
118
119     ConversionResult ConvertUTF16toUTF8(const UTF16** sourceStart,
120         const UTF16* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
121         ConversionFlags flags);
122
123     ConversionResult ConvertUTF8toUTF32(const UTF8** sourceStart,
124         const UTF8* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
125         ConversionFlags flags);
126
127     ConversionResult ConvertUTF32toUTF8(const UTF32** sourceStart,
128         const UTF32* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
129         ConversionFlags flags);
130
131     ConversionResult ConvertUTF16toUTF32(const UTF16** sourceStart,
132         const UTF16* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
133         ConversionFlags flags);
134
135     ConversionResult ConvertUTF32toUTF16(const UTF32** sourceStart,
136         const UTF32* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
137         ConversionFlags flags);
138
139     Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd);
140
141 #ifdef __cplusplus
142 }
143 #endif
144
145
146
147
148
149 ////////////////////////////////////////////////////////////////////////////////////////////////
150 //
151 //convertUTF.c
152 //
153
154 #include "ConvertUTF.h"
155 #ifdef CVTUTF_DEBUG
156 #include <stdio.h>
157 #endif
158
159 static const int halfShift = 10; /* used for shifting by 10 bits */
160
161 static const UTF32 halfBase = 0x0010000UL;
162 static const UTF32 halfMask = 0x3FFUL;
163
/*
 * Bounds of the UTF-16 surrogate ranges. The expansions are parenthesized so
 * each macro is a single operand wherever it is used.
 */
#define UNI_SUR_HIGH_START  ((UTF32)0xD800)
#define UNI_SUR_HIGH_END    ((UTF32)0xDBFF)
#define UNI_SUR_LOW_START   ((UTF32)0xDC00)
#define UNI_SUR_LOW_END     ((UTF32)0xDFFF)

/*
 * Provide false/true only when nothing else has: they are keywords in C++,
 * and <stdbool.h> already defines them as macros in C.
 */
#if !defined(__cplusplus)
#ifndef false
#define false      0
#endif
#ifndef true
#define true        1
#endif
#endif
170
171 /* --------------------------------------------------------------------- */
172
173 ConversionResult ConvertUTF32toUTF16(const UTF32** sourceStart,
174     const UTF32* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
175     ConversionFlags flags)
176 {
177     ConversionResult result = conversionOK;
178     const UTF32* source = *sourceStart;
179     UTF16* target = *targetStart;
180     while (source < sourceEnd)
181     {
182         UTF32 ch;
183         if (target >= targetEnd)
184         {
185             result = targetExhausted; break;
186         }
187         ch = *source++;
188         if (ch <= UNI_MAX_BMP)
189         {
190             /* Target is a character <= 0xFFFF */
191             /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
192             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
193             {
194                 if (flags == strictConversion)
195                 {
196                     --source; /* return to the illegal value itself */
197                     result = sourceIllegal;
198                     break;
199                 }
200                 else
201                 {
202                     *target++ = UNI_REPLACEMENT_CHAR;
203                 }
204             }
205             else
206             {
207                 *target++ = (UTF16) ch; /* normal case */
208             }
209         }
210         else if (ch > UNI_MAX_LEGAL_UTF32)
211         {
212             if (flags == strictConversion)
213             {
214                 result = sourceIllegal;
215             }
216             else
217             {
218                 *target++ = UNI_REPLACEMENT_CHAR;
219             }
220         }
221         else
222         {
223             /* target is a character in range 0xFFFF - 0x10FFFF. */
224             if (target + 1 >= targetEnd)
225             {
226                 --source; /* Back up source pointer! */
227                 result = targetExhausted; break;
228             }
229             ch -= halfBase;
230             *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
231             *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
232         }
233     }
234     *sourceStart = source;
235     *targetStart = target;
236     return result;
237 }
238
239 /* --------------------------------------------------------------------- */
240
241 ConversionResult ConvertUTF16toUTF32(const UTF16** sourceStart,
242     const UTF16* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
243     ConversionFlags flags)
244 {
245     ConversionResult result = conversionOK;
246     const UTF16* source = *sourceStart;
247     UTF32* target = *targetStart;
248     UTF32 ch, ch2;
249     while (source < sourceEnd)
250     {
251         const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
252         ch = *source++;
253         /* If we have a surrogate pair, convert to UTF32 first. */
254         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
255         {
256             /* If the 16 bits following the high surrogate are in the source buffer... */
257             if (source < sourceEnd)
258             {
259                 ch2 = *source;
260                 /* If it's a low surrogate, convert to UTF32. */
261                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
262                 {
263                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift) +
264                         (ch2 - UNI_SUR_LOW_START) +
265                         halfBase;
266                     ++source;
267                 }
268                 else if (flags == strictConversion)
269                 {
270                     /* it's an unpaired high surrogate */
271                     --source; /* return to the illegal value itself */
272                     result = sourceIllegal;
273                     break;
274                 }
275             }
276             else
277             {
278                 /* We don't have the 16 bits following the high surrogate. */
279                 --source; /* return to the high surrogate */
280                 result = sourceExhausted;
281                 break;
282             }
283         }
284         else if (flags == strictConversion)
285         {
286             /* UTF-16 surrogate values are illegal in UTF-32 */
287             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
288             {
289                 --source; /* return to the illegal value itself */
290                 result = sourceIllegal;
291                 break;
292             }
293         }
294         if (target >= targetEnd)
295         {
296             source = oldSource; /* Back up source pointer! */
297             result = targetExhausted; break;
298         }
299         *target++ = ch;
300     }
301     *sourceStart = source;
302     *targetStart = target;
303 #ifdef CVTUTF_DEBUG
304     if (result == sourceIllegal)
305     {
306         fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch,
307             ch2);
308         fflush(stderr);
309     }
310 #endif
311     return result;
312 }
313
314 /* --------------------------------------------------------------------- */
315
316 /*
317 * Index into the table below with the first byte of a UTF-8 sequence to
318 * get the number of trailing bytes that are supposed to follow it.
319 * Note that *legal* UTF-8 values can't have 4 or 5 trailing bytes. The table is
320 * left as-is for anyone who may want to do such conversion, which was
321 * allowed in earlier algorithms.
322 */
/* Indexed by lead byte; one row per high nibble (16 entries each). */
static const char trailingBytesForUTF8[256] =
{
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xF0 */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
337
338 /*
339 * Magic values subtracted from a buffer value during UTF8 conversion.
340 * This table contains as many values as there might be trailing bytes
341 * in a UTF-8 sequence.
342 */
343 static const UTF32 offsetsFromUTF8[6] =
344 {
345     0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, 0xFA082080UL,
346     0x82082080UL
347 };
348
349 /*
350 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
351 * into the first byte, depending on how many bytes follow.  There are
352 * as many entries in this table as there are UTF-8 sequence types.
353 * (I.e., one byte sequence, two byte... etc.). Remember that sequences
354 * for *legal* UTF-8 will be 4 or fewer bytes total.
355 */
356 static const UTF8 firstByteMark[7] =
357 {
358     0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
359 };
360
361 /* --------------------------------------------------------------------- */
362
363 /* The interface converts a whole buffer to avoid function-call overhead.
364 * Constants have been gathered. Loops & conditionals have been removed as
365 * much as possible for efficiency, in favor of drop-through switches.
366 * (See "Note A" at the bottom of the file for equivalent code.)
367 * If your compiler supports it, the "isLegalUTF8" call can be turned
368 * into an inline function.
369 */
370
371 /* --------------------------------------------------------------------- */
372
373 ConversionResult ConvertUTF16toUTF8(const UTF16** sourceStart,
374     const UTF16* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
375     ConversionFlags flags)
376 {
377     ConversionResult result = conversionOK;
378     const UTF16* source = *sourceStart;
379     UTF8* target = *targetStart;
380     while (source < sourceEnd)
381     {
382         UTF32 ch;
383         unsigned short bytesToWrite = 0;
384         const UTF32 byteMask = 0xBF;
385         const UTF32 byteMark = 0x80;
386         const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
387         ch = *source++;
388         /* If we have a surrogate pair, convert to UTF32 first. */
389         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
390         {
391             /* If the 16 bits following the high surrogate are in the source buffer... */
392             if (source < sourceEnd)
393             {
394                 UTF32 ch2 = *source;
395                 /* If it's a low surrogate, convert to UTF32. */
396                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
397                 {
398                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift) +
399                         (ch2 - UNI_SUR_LOW_START) +
400                         halfBase;
401                     ++source;
402                 }
403                 else if (flags == strictConversion)
404                 {
405                     /* it's an unpaired high surrogate */
406                     --source; /* return to the illegal value itself */
407                     result = sourceIllegal;
408                     break;
409                 }
410             }
411             else
412             {
413                 /* We don't have the 16 bits following the high surrogate. */
414                 --source; /* return to the high surrogate */
415                 result = sourceExhausted;
416                 break;
417             }
418         }
419         else if (flags == strictConversion)
420         {
421             /* UTF-16 surrogate values are illegal in UTF-32 */
422             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
423             {
424                 --source; /* return to the illegal value itself */
425                 result = sourceIllegal;
426                 break;
427             }
428         }
429         /* Figure out how many bytes the result will require */
430         if (ch < (UTF32) 0x80)
431         {
432             bytesToWrite = 1;
433         }
434         else if (ch < (UTF32) 0x800)
435         {
436             bytesToWrite = 2;
437         }
438         else if (ch < (UTF32) 0x10000)
439         {
440             bytesToWrite = 3;
441         }
442         else if (ch < (UTF32) 0x110000)
443         {
444             bytesToWrite = 4;
445         }
446         else
447         {
448             bytesToWrite = 3;
449             ch = UNI_REPLACEMENT_CHAR;
450         }
451
452         target += bytesToWrite;
453         if (target > targetEnd)
454         {
455             source = oldSource; /* Back up source pointer! */
456             target -= bytesToWrite; result = targetExhausted; break;
457         }
458         switch (bytesToWrite)
459         {
460             /* note: everything falls through. */
461         case 4:
462             *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
463         case 3:
464             *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
465         case 2:
466             *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
467         case 1:
468             *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
469         }
470         target += bytesToWrite;
471     }
472     *sourceStart = source;
473     *targetStart = target;
474     return result;
475 }
476
477 /* --------------------------------------------------------------------- */
478
479 /*
480 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
481 * This must be called with the length pre-determined by the first byte.
482 * If not calling this from ConvertUTF8to*, then the length can be set by:
483 *  length = trailingBytesForUTF8[*source]+1;
484 * and the sequence is illegal right away if there aren't that many bytes
485 * available.
486 * If presented with a length > 4, this returns false.  The Unicode
487 * definition of UTF-8 goes up to 4-byte sequences.
488 */
489
490 static Boolean isLegalUTF8(const UTF8* source, int length)
491 {
492     UTF8 a;
493     const UTF8* srcptr = source + length;
494     switch (length)
495     {
496     default:
497         return false;
498         /* Everything else falls through when "true"... */
499     case 4:
500         if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
501             return false;
502     case 3:
503         if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
504             return false;
505     case 2:
506         if ((a = (*--srcptr)) > 0xBF)
507             return false;
508
509         switch (*source)
510         {
511             /* no fall-through in this inner switch */
512         case 0xE0:
513             if (a < 0xA0)
514                 return false; break;
515         case 0xED:
516             if (a > 0x9F)
517                 return false; break;
518         case 0xF0:
519             if (a < 0x90)
520                 return false; break;
521         case 0xF4:
522             if (a > 0x8F)
523                 return false; break;
524         default:
525             if (a < 0x80)
526                 return false;
527         }
528
529     case 1:
530         if (*source >= 0x80 && *source < 0xC2)
531             return false;
532     }
533     if (*source > 0xF4)
534         return false;
535     return true;
536 }
537
538 /* --------------------------------------------------------------------- */
539
540 /*
541 * Exported function to return whether a UTF-8 sequence is legal or not.
542 * This is not used here; it's just exported.
543 */
544 Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd)
545 {
546     int length = trailingBytesForUTF8[*source] + 1;
547     if (source + length > sourceEnd)
548     {
549         return false;
550     }
551     return isLegalUTF8(source, length);
552 }
553
554 /* --------------------------------------------------------------------- */
555
556 ConversionResult ConvertUTF8toUTF16(const UTF8** sourceStart,
557     const UTF8* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
558     ConversionFlags flags)
559 {
560     ConversionResult result = conversionOK;
561     const UTF8* source = *sourceStart;
562     UTF16* target = *targetStart;
563     while (source < sourceEnd)
564     {
565         UTF32 ch = 0;
566         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
567         if (source + extraBytesToRead >= sourceEnd)
568         {
569             result = sourceExhausted; break;
570         }
571         /* Do this check whether lenient or strict */
572         if (!isLegalUTF8(source, extraBytesToRead + 1))
573         {
574             result = sourceIllegal;
575             break;
576         }
577         /*
578                      * The cases all fall through. See "Note A" below.
579                      */
580         switch (extraBytesToRead)
581         {
582         case 5:
583             ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
584         case 4:
585             ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
586         case 3:
587             ch += *source++; ch <<= 6;
588         case 2:
589             ch += *source++; ch <<= 6;
590         case 1:
591             ch += *source++; ch <<= 6;
592         case 0:
593             ch += *source++;
594         }
595         ch -= offsetsFromUTF8[extraBytesToRead];
596
597         if (target >= targetEnd)
598         {
599             source -= (extraBytesToRead + 1); /* Back up source pointer! */
600             result = targetExhausted; break;
601         }
602         if (ch <= UNI_MAX_BMP)
603         {
604             /* Target is a character <= 0xFFFF */
605             /* UTF-16 surrogate values are illegal in UTF-32 */
606             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
607             {
608                 if (flags == strictConversion)
609                 {
610                     source -= (extraBytesToRead + 1); /* return to the illegal value itself */
611                     result = sourceIllegal;
612                     break;
613                 }
614                 else
615                 {
616                     *target++ = UNI_REPLACEMENT_CHAR;
617                 }
618             }
619             else
620             {
621                 *target++ = (UTF16) ch; /* normal case */
622             }
623         }
624         else if (ch > UNI_MAX_UTF16)
625         {
626             if (flags == strictConversion)
627             {
628                 result = sourceIllegal;
629                 source -= (extraBytesToRead + 1); /* return to the start */
630                 break; /* Bail out; shouldn't continue */
631             }
632             else
633             {
634                 *target++ = UNI_REPLACEMENT_CHAR;
635             }
636         }
637         else
638         {
639             /* target is a character in range 0xFFFF - 0x10FFFF. */
640             if (target + 1 >= targetEnd)
641             {
642                 source -= (extraBytesToRead + 1); /* Back up source pointer! */
643                 result = targetExhausted; break;
644             }
645             ch -= halfBase;
646             *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
647             *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
648         }
649     }
650     *sourceStart = source;
651     *targetStart = target;
652     return result;
653 }
654
655 /* --------------------------------------------------------------------- */
656
657 ConversionResult ConvertUTF32toUTF8(const UTF32** sourceStart,
658     const UTF32* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
659     ConversionFlags flags)
660 {
661     ConversionResult result = conversionOK;
662     const UTF32* source = *sourceStart;
663     UTF8* target = *targetStart;
664     while (source < sourceEnd)
665     {
666         UTF32 ch;
667         unsigned short bytesToWrite = 0;
668         const UTF32 byteMask = 0xBF;
669         const UTF32 byteMark = 0x80;
670         ch = *source++;
671         if (flags == strictConversion)
672         {
673             /* UTF-16 surrogate values are illegal in UTF-32 */
674             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
675             {
676                 --source; /* return to the illegal value itself */
677                 result = sourceIllegal;
678                 break;
679             }
680         }
681         /*
682                  * Figure out how many bytes the result will require. Turn any
683                  * illegally large UTF32 things (> Plane 17) into replacement chars.
684                  */
685         if (ch < (UTF32) 0x80)
686         {
687             bytesToWrite = 1;
688         }
689         else if (ch < (UTF32) 0x800)
690         {
691             bytesToWrite = 2;
692         }
693         else if (ch < (UTF32) 0x10000)
694         {
695             bytesToWrite = 3;
696         }
697         else if (ch <= UNI_MAX_LEGAL_UTF32)
698         {
699             bytesToWrite = 4;
700         }
701         else
702         {
703             bytesToWrite = 3;
704             ch = UNI_REPLACEMENT_CHAR;
705             result = sourceIllegal;
706         }
707
708         target += bytesToWrite;
709         if (target > targetEnd)
710         {
711             --source; /* Back up source pointer! */
712             target -= bytesToWrite; result = targetExhausted; break;
713         }
714         switch (bytesToWrite)
715         {
716             /* note: everything falls through. */
717         case 4:
718             *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
719         case 3:
720             *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
721         case 2:
722             *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
723         case 1:
724             *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
725         }
726         target += bytesToWrite;
727     }
728     *sourceStart = source;
729     *targetStart = target;
730     return result;
731 }
732
733 /* --------------------------------------------------------------------- */
734
735 ConversionResult ConvertUTF8toUTF32(const UTF8** sourceStart,
736     const UTF8* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
737     ConversionFlags flags)
738 {
739     ConversionResult result = conversionOK;
740     const UTF8* source = *sourceStart;
741     UTF32* target = *targetStart;
742     while (source < sourceEnd)
743     {
744         UTF32 ch = 0;
745         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
746         if (source + extraBytesToRead >= sourceEnd)
747         {
748             result = sourceExhausted; break;
749         }
750         /* Do this check whether lenient or strict */
751         if (!isLegalUTF8(source, extraBytesToRead + 1))
752         {
753             result = sourceIllegal;
754             break;
755         }
756         /*
757                      * The cases all fall through. See "Note A" below.
758                      */
759         switch (extraBytesToRead)
760         {
761         case 5:
762             ch += *source++; ch <<= 6;
763         case 4:
764             ch += *source++; ch <<= 6;
765         case 3:
766             ch += *source++; ch <<= 6;
767         case 2:
768             ch += *source++; ch <<= 6;
769         case 1:
770             ch += *source++; ch <<= 6;
771         case 0:
772             ch += *source++;
773         }
774         ch -= offsetsFromUTF8[extraBytesToRead];
775
776         if (target >= targetEnd)
777         {
778             source -= (extraBytesToRead + 1); /* Back up the source pointer! */
779             result = targetExhausted; break;
780         }
781         if (ch <= UNI_MAX_LEGAL_UTF32)
782         {
783             /*
784                                      * UTF-16 surrogate values are illegal in UTF-32, and anything
785                                      * over Plane 17 (> 0x10FFFF) is illegal.
786                                      */
787             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
788             {
789                 if (flags == strictConversion)
790                 {
791                     source -= (extraBytesToRead + 1); /* return to the illegal value itself */
792                     result = sourceIllegal;
793                     break;
794                 }
795                 else
796                 {
797                     *target++ = UNI_REPLACEMENT_CHAR;
798                 }
799             }
800             else
801             {
802                 *target++ = ch;
803             }
804         }
805         else
806         {
807             /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
808             result = sourceIllegal;
809             *target++ = UNI_REPLACEMENT_CHAR;
810         }
811     }
812     *sourceStart = source;
813     *targetStart = target;
814     return result;
815 }


所有评论,共0条:( 我也来说两句)


发表评论

注册登录后再发表评论