比这篇新的文章:
Codee#2522
比这篇旧的文章: 一些工具模板函数
作者: doorfly, 点击812次, 评论(0), 收藏者(1), , 打分:
所有评论,共0条:( 我也来说两句)
比这篇旧的文章: 一些工具模板函数
unicode各种编码格式之间的转换(utf8,utf16, utf32)
语言: C++, 标签: utf unicode 2009/06/28发布 8个月前更新作者: doorfly, 点击812次, 评论(0), 收藏者(1), , 打分:
001 //
002 //convertUTF.h
003 //
004
005
006 /*
007 * Copyright 2001-2004 Unicode, Inc.
008 *
009 * Disclaimer
010 *
011 */
012
013 /* ---------------------------------------------------------------------
014
015 Conversions between UTF32, UTF-16, and UTF-8. Header file.
016
017 Several functions are included here, forming a complete set of
018 conversions between the three formats. UTF-7 is not included
019 here, but is handled in a separate source file.
020
021 Each of these routines takes pointers to input buffers and output
022 buffers. The input buffers are const.
023
024 Each routine converts the text between *sourceStart and sourceEnd,
025 putting the result into the buffer between *targetStart and
026 targetEnd. Note: the end pointers are *after* the last item: e.g.
027 *(sourceEnd - 1) is the last item.
028
029 The return result indicates whether the conversion was successful,
030 and if not, whether the problem was in the source or target buffers.
031 (Only the first encountered problem is indicated.)
032
033 After the conversion, *sourceStart and *targetStart are both
034 updated to point to the end of last text successfully converted in
035 the respective buffers.
036
037 Input parameters:
038 sourceStart - pointer to a pointer to the source buffer.
039 The contents of this are modified on return so that
040 it points at the next thing to be converted.
041 targetStart - similarly, pointer to pointer to the target buffer.
042 sourceEnd, targetEnd - respectively pointers to the ends of the
043 two buffers, for overflow checking only.
044
045 These conversion functions take a ConversionFlags argument. When this
046 flag is set to strict, both irregular sequences and isolated surrogates
047 will cause an error. When the flag is set to lenient, both irregular
048 sequences and isolated surrogates are converted.
049
050 Whether the flag is strict or lenient, all illegal sequences will cause
051 an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
052 or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
053 must check for illegal sequences.
054
055 When the flag is set to lenient, characters over 0x10FFFF are converted
056 to the replacement character; otherwise (when the flag is set to strict)
057 they constitute an error.
058
059 Output parameters:
060 The value "sourceIllegal" is returned from some routines if the input
061 sequence is malformed. When "sourceIllegal" is returned, the source
062 value will point to the illegal value that caused the problem. E.g.,
063 in UTF-8 when a sequence is malformed, it points to the start of the
064 malformed sequence.
065
066 Author: Mark E. Davis, 1994.
067 Rev History: Rick McGowan, fixes & updates May 2001.
068 Fixes & updates, Sept 2001.
069
070 ------------------------------------------------------------------------ */
071
072 /* ---------------------------------------------------------------------
073 The following 4 definitions are compiler-specific.
074 The C standard does not guarantee that wchar_t has at least
075 16 bits, so wchar_t is no less portable than unsigned short!
076 All should be unsigned values to avoid sign extension during
077 bit mask & shift operations.
078 ------------------------------------------------------------------------ */
079
/*
 * Code-unit types used throughout the converters. All are unsigned so that
 * masking and shifting never sign-extend.
 * NOTE(review): `unsigned long` is wider than 32 bits on some ABIs; confirm
 * that callers size their UTF-32 buffers with sizeof(UTF32).
 */
typedef unsigned long UTF32;    /* at least 32 bits */
typedef unsigned short UTF16;   /* at least 16 bits */
typedef unsigned char UTF8;     /* typically 8 bits */
typedef unsigned char Boolean;  /* 0 or 1 */

/*
 * Some fundamental constants. Each expansion is fully parenthesized so the
 * macro behaves as a single operand in any expression (e.g. under sizeof).
 */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD)
#define UNI_MAX_BMP          ((UTF32)0x0000FFFF)
#define UNI_MAX_UTF16        ((UTF32)0x0010FFFF)
#define UNI_MAX_UTF32        ((UTF32)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32  ((UTF32)0x0010FFFF)
091
/* Status code returned by every Convert* routine. */
typedef enum
{
    conversionOK = 0,    /* conversion successful */
    sourceExhausted = 1, /* partial character in source, but hit end */
    targetExhausted = 2, /* insufficient room in target for conversion */
    sourceIllegal = 3    /* source sequence is illegal/malformed */
} ConversionResult;

/* Selects how the converters treat surrogates and out-of-range values. */
typedef enum
{
    strictConversion = 0,
    lenientConversion = 1
} ConversionFlags;
108
109 /* This is for C++ and does no harm in C */
110 #ifdef __cplusplus
111 extern "C"
112 {
113 #endif
114
115 ConversionResult ConvertUTF8toUTF16(const UTF8** sourceStart,
116 const UTF8* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
117 ConversionFlags flags);
118
119 ConversionResult ConvertUTF16toUTF8(const UTF16** sourceStart,
120 const UTF16* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
121 ConversionFlags flags);
122
123 ConversionResult ConvertUTF8toUTF32(const UTF8** sourceStart,
124 const UTF8* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
125 ConversionFlags flags);
126
127 ConversionResult ConvertUTF32toUTF8(const UTF32** sourceStart,
128 const UTF32* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
129 ConversionFlags flags);
130
131 ConversionResult ConvertUTF16toUTF32(const UTF16** sourceStart,
132 const UTF16* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
133 ConversionFlags flags);
134
135 ConversionResult ConvertUTF32toUTF16(const UTF32** sourceStart,
136 const UTF32* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
137 ConversionFlags flags);
138
139 Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd);
140
141 #ifdef __cplusplus
142 }
143 #endif
144
145
146
147
148
149 ////////////////////////////////////////////////////////////////////////////////////////////////
150 //
151 //convertUTF.c
152 //
153
154 #include "ConvertUTF.h"
155 #ifdef CVTUTF_DEBUG
156 #include <stdio.h>
157 #endif
158
159 static const int halfShift = 10; /* used for shifting by 10 bits */
160
161 static const UTF32 halfBase = 0x0010000UL;
162 static const UTF32 halfMask = 0x3FFUL;
163
164 #define UNI_SUR_HIGH_START (UTF32)0xD800
165 #define UNI_SUR_HIGH_END (UTF32)0xDBFF
166 #define UNI_SUR_LOW_START (UTF32)0xDC00
167 #define UNI_SUR_LOW_END (UTF32)0xDFFF
168 #define false 0
169 #define true 1
170
171 /* --------------------------------------------------------------------- */
172
173 ConversionResult ConvertUTF32toUTF16(const UTF32** sourceStart,
174 const UTF32* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
175 ConversionFlags flags)
176 {
177 ConversionResult result = conversionOK;
178 const UTF32* source = *sourceStart;
179 UTF16* target = *targetStart;
180 while (source < sourceEnd)
181 {
182 UTF32 ch;
183 if (target >= targetEnd)
184 {
185 result = targetExhausted; break;
186 }
187 ch = *source++;
188 if (ch <= UNI_MAX_BMP)
189 {
190 /* Target is a character <= 0xFFFF */
191 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
192 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
193 {
194 if (flags == strictConversion)
195 {
196 --source; /* return to the illegal value itself */
197 result = sourceIllegal;
198 break;
199 }
200 else
201 {
202 *target++ = UNI_REPLACEMENT_CHAR;
203 }
204 }
205 else
206 {
207 *target++ = (UTF16) ch; /* normal case */
208 }
209 }
210 else if (ch > UNI_MAX_LEGAL_UTF32)
211 {
212 if (flags == strictConversion)
213 {
214 result = sourceIllegal;
215 }
216 else
217 {
218 *target++ = UNI_REPLACEMENT_CHAR;
219 }
220 }
221 else
222 {
223 /* target is a character in range 0xFFFF - 0x10FFFF. */
224 if (target + 1 >= targetEnd)
225 {
226 --source; /* Back up source pointer! */
227 result = targetExhausted; break;
228 }
229 ch -= halfBase;
230 *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
231 *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
232 }
233 }
234 *sourceStart = source;
235 *targetStart = target;
236 return result;
237 }
238
239 /* --------------------------------------------------------------------- */
240
241 ConversionResult ConvertUTF16toUTF32(const UTF16** sourceStart,
242 const UTF16* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
243 ConversionFlags flags)
244 {
245 ConversionResult result = conversionOK;
246 const UTF16* source = *sourceStart;
247 UTF32* target = *targetStart;
248 UTF32 ch, ch2;
249 while (source < sourceEnd)
250 {
251 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
252 ch = *source++;
253 /* If we have a surrogate pair, convert to UTF32 first. */
254 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
255 {
256 /* If the 16 bits following the high surrogate are in the source buffer... */
257 if (source < sourceEnd)
258 {
259 ch2 = *source;
260 /* If it's a low surrogate, convert to UTF32. */
261 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
262 {
263 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) +
264 (ch2 - UNI_SUR_LOW_START) +
265 halfBase;
266 ++source;
267 }
268 else if (flags == strictConversion)
269 {
270 /* it's an unpaired high surrogate */
271 --source; /* return to the illegal value itself */
272 result = sourceIllegal;
273 break;
274 }
275 }
276 else
277 {
278 /* We don't have the 16 bits following the high surrogate. */
279 --source; /* return to the high surrogate */
280 result = sourceExhausted;
281 break;
282 }
283 }
284 else if (flags == strictConversion)
285 {
286 /* UTF-16 surrogate values are illegal in UTF-32 */
287 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
288 {
289 --source; /* return to the illegal value itself */
290 result = sourceIllegal;
291 break;
292 }
293 }
294 if (target >= targetEnd)
295 {
296 source = oldSource; /* Back up source pointer! */
297 result = targetExhausted; break;
298 }
299 *target++ = ch;
300 }
301 *sourceStart = source;
302 *targetStart = target;
303 #ifdef CVTUTF_DEBUG
304 if (result == sourceIllegal)
305 {
306 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch,
307 ch2);
308 fflush(stderr);
309 }
310 #endif
311 return result;
312 }
313
314 /* --------------------------------------------------------------------- */
315
316 /*
317 * Index into the table below with the first byte of a UTF-8 sequence to
318 * get the number of trailing bytes that are supposed to follow it.
319 * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
320 * left as-is for anyone who may want to do such conversion, which was
321 * allowed in earlier algorithms.
322 */
/*
 * Number of trailing bytes expected after a given first byte of a UTF-8
 * sequence, indexed by that first byte. One row per 16 byte values.
 */
static const char trailingBytesForUTF8[256] =
{
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xF0 */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
337
338 /*
339 * Magic values subtracted from a buffer value during UTF8 conversion.
340 * This table contains as many values as there might be trailing bytes
341 * in a UTF-8 sequence.
342 */
343 static const UTF32 offsetsFromUTF8[6] =
344 {
345 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, 0xFA082080UL,
346 0x82082080UL
347 };
348
349 /*
350 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
351 * into the first byte, depending on how many bytes follow. There are
352 * as many entries in this table as there are UTF-8 sequence types.
353 * (I.e., one byte sequence, two byte... etc.). Remember that sequences
354 * for *legal* UTF-8 will be 4 or fewer bytes total.
355 */
356 static const UTF8 firstByteMark[7] =
357 {
358 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
359 };
360
361 /* --------------------------------------------------------------------- */
362
363 /* The interface converts a whole buffer to avoid function-call overhead.
364 * Constants have been gathered. Loops & conditionals have been removed as
365 * much as possible for efficiency, in favor of drop-through switches.
366 * (See "Note A" at the bottom of the file for equivalent code.)
367 * If your compiler supports it, the "isLegalUTF8" call can be turned
368 * into an inline function.
369 */
370
371 /* --------------------------------------------------------------------- */
372
373 ConversionResult ConvertUTF16toUTF8(const UTF16** sourceStart,
374 const UTF16* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
375 ConversionFlags flags)
376 {
377 ConversionResult result = conversionOK;
378 const UTF16* source = *sourceStart;
379 UTF8* target = *targetStart;
380 while (source < sourceEnd)
381 {
382 UTF32 ch;
383 unsigned short bytesToWrite = 0;
384 const UTF32 byteMask = 0xBF;
385 const UTF32 byteMark = 0x80;
386 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
387 ch = *source++;
388 /* If we have a surrogate pair, convert to UTF32 first. */
389 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
390 {
391 /* If the 16 bits following the high surrogate are in the source buffer... */
392 if (source < sourceEnd)
393 {
394 UTF32 ch2 = *source;
395 /* If it's a low surrogate, convert to UTF32. */
396 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
397 {
398 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) +
399 (ch2 - UNI_SUR_LOW_START) +
400 halfBase;
401 ++source;
402 }
403 else if (flags == strictConversion)
404 {
405 /* it's an unpaired high surrogate */
406 --source; /* return to the illegal value itself */
407 result = sourceIllegal;
408 break;
409 }
410 }
411 else
412 {
413 /* We don't have the 16 bits following the high surrogate. */
414 --source; /* return to the high surrogate */
415 result = sourceExhausted;
416 break;
417 }
418 }
419 else if (flags == strictConversion)
420 {
421 /* UTF-16 surrogate values are illegal in UTF-32 */
422 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
423 {
424 --source; /* return to the illegal value itself */
425 result = sourceIllegal;
426 break;
427 }
428 }
429 /* Figure out how many bytes the result will require */
430 if (ch < (UTF32) 0x80)
431 {
432 bytesToWrite = 1;
433 }
434 else if (ch < (UTF32) 0x800)
435 {
436 bytesToWrite = 2;
437 }
438 else if (ch < (UTF32) 0x10000)
439 {
440 bytesToWrite = 3;
441 }
442 else if (ch < (UTF32) 0x110000)
443 {
444 bytesToWrite = 4;
445 }
446 else
447 {
448 bytesToWrite = 3;
449 ch = UNI_REPLACEMENT_CHAR;
450 }
451
452 target += bytesToWrite;
453 if (target > targetEnd)
454 {
455 source = oldSource; /* Back up source pointer! */
456 target -= bytesToWrite; result = targetExhausted; break;
457 }
458 switch (bytesToWrite)
459 {
460 /* note: everything falls through. */
461 case 4:
462 *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
463 case 3:
464 *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
465 case 2:
466 *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
467 case 1:
468 *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
469 }
470 target += bytesToWrite;
471 }
472 *sourceStart = source;
473 *targetStart = target;
474 return result;
475 }
476
477 /* --------------------------------------------------------------------- */
478
479 /*
480 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
481 * This must be called with the length pre-determined by the first byte.
482 * If not calling this from ConvertUTF8to*, then the length can be set by:
483 * length = trailingBytesForUTF8[*source]+1;
484 * and the sequence is illegal right away if there aren't that many bytes
485 * available.
486 * If presented with a length > 4, this returns false. The Unicode
487 * definition of UTF-8 goes up to 4-byte sequences.
488 */
489
490 static Boolean isLegalUTF8(const UTF8* source, int length)
491 {
492 UTF8 a;
493 const UTF8* srcptr = source + length;
494 switch (length)
495 {
496 default:
497 return false;
498 /* Everything else falls through when "true"... */
499 case 4:
500 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
501 return false;
502 case 3:
503 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
504 return false;
505 case 2:
506 if ((a = (*--srcptr)) > 0xBF)
507 return false;
508
509 switch (*source)
510 {
511 /* no fall-through in this inner switch */
512 case 0xE0:
513 if (a < 0xA0)
514 return false; break;
515 case 0xED:
516 if (a > 0x9F)
517 return false; break;
518 case 0xF0:
519 if (a < 0x90)
520 return false; break;
521 case 0xF4:
522 if (a > 0x8F)
523 return false; break;
524 default:
525 if (a < 0x80)
526 return false;
527 }
528
529 case 1:
530 if (*source >= 0x80 && *source < 0xC2)
531 return false;
532 }
533 if (*source > 0xF4)
534 return false;
535 return true;
536 }
537
538 /* --------------------------------------------------------------------- */
539
540 /*
541 * Exported function to return whether a UTF-8 sequence is legal or not.
542 * This is not used here; it's just exported.
543 */
544 Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd)
545 {
546 int length = trailingBytesForUTF8[*source] + 1;
547 if (source + length > sourceEnd)
548 {
549 return false;
550 }
551 return isLegalUTF8(source, length);
552 }
553
554 /* --------------------------------------------------------------------- */
555
556 ConversionResult ConvertUTF8toUTF16(const UTF8** sourceStart,
557 const UTF8* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
558 ConversionFlags flags)
559 {
560 ConversionResult result = conversionOK;
561 const UTF8* source = *sourceStart;
562 UTF16* target = *targetStart;
563 while (source < sourceEnd)
564 {
565 UTF32 ch = 0;
566 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
567 if (source + extraBytesToRead >= sourceEnd)
568 {
569 result = sourceExhausted; break;
570 }
571 /* Do this check whether lenient or strict */
572 if (!isLegalUTF8(source, extraBytesToRead + 1))
573 {
574 result = sourceIllegal;
575 break;
576 }
577 /*
578 * The cases all fall through. See "Note A" below.
579 */
580 switch (extraBytesToRead)
581 {
582 case 5:
583 ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
584 case 4:
585 ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
586 case 3:
587 ch += *source++; ch <<= 6;
588 case 2:
589 ch += *source++; ch <<= 6;
590 case 1:
591 ch += *source++; ch <<= 6;
592 case 0:
593 ch += *source++;
594 }
595 ch -= offsetsFromUTF8[extraBytesToRead];
596
597 if (target >= targetEnd)
598 {
599 source -= (extraBytesToRead + 1); /* Back up source pointer! */
600 result = targetExhausted; break;
601 }
602 if (ch <= UNI_MAX_BMP)
603 {
604 /* Target is a character <= 0xFFFF */
605 /* UTF-16 surrogate values are illegal in UTF-32 */
606 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
607 {
608 if (flags == strictConversion)
609 {
610 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
611 result = sourceIllegal;
612 break;
613 }
614 else
615 {
616 *target++ = UNI_REPLACEMENT_CHAR;
617 }
618 }
619 else
620 {
621 *target++ = (UTF16) ch; /* normal case */
622 }
623 }
624 else if (ch > UNI_MAX_UTF16)
625 {
626 if (flags == strictConversion)
627 {
628 result = sourceIllegal;
629 source -= (extraBytesToRead + 1); /* return to the start */
630 break; /* Bail out; shouldn't continue */
631 }
632 else
633 {
634 *target++ = UNI_REPLACEMENT_CHAR;
635 }
636 }
637 else
638 {
639 /* target is a character in range 0xFFFF - 0x10FFFF. */
640 if (target + 1 >= targetEnd)
641 {
642 source -= (extraBytesToRead + 1); /* Back up source pointer! */
643 result = targetExhausted; break;
644 }
645 ch -= halfBase;
646 *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
647 *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
648 }
649 }
650 *sourceStart = source;
651 *targetStart = target;
652 return result;
653 }
654
655 /* --------------------------------------------------------------------- */
656
657 ConversionResult ConvertUTF32toUTF8(const UTF32** sourceStart,
658 const UTF32* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
659 ConversionFlags flags)
660 {
661 ConversionResult result = conversionOK;
662 const UTF32* source = *sourceStart;
663 UTF8* target = *targetStart;
664 while (source < sourceEnd)
665 {
666 UTF32 ch;
667 unsigned short bytesToWrite = 0;
668 const UTF32 byteMask = 0xBF;
669 const UTF32 byteMark = 0x80;
670 ch = *source++;
671 if (flags == strictConversion)
672 {
673 /* UTF-16 surrogate values are illegal in UTF-32 */
674 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
675 {
676 --source; /* return to the illegal value itself */
677 result = sourceIllegal;
678 break;
679 }
680 }
681 /*
682 * Figure out how many bytes the result will require. Turn any
683 * illegally large UTF32 things (> Plane 17) into replacement chars.
684 */
685 if (ch < (UTF32) 0x80)
686 {
687 bytesToWrite = 1;
688 }
689 else if (ch < (UTF32) 0x800)
690 {
691 bytesToWrite = 2;
692 }
693 else if (ch < (UTF32) 0x10000)
694 {
695 bytesToWrite = 3;
696 }
697 else if (ch <= UNI_MAX_LEGAL_UTF32)
698 {
699 bytesToWrite = 4;
700 }
701 else
702 {
703 bytesToWrite = 3;
704 ch = UNI_REPLACEMENT_CHAR;
705 result = sourceIllegal;
706 }
707
708 target += bytesToWrite;
709 if (target > targetEnd)
710 {
711 --source; /* Back up source pointer! */
712 target -= bytesToWrite; result = targetExhausted; break;
713 }
714 switch (bytesToWrite)
715 {
716 /* note: everything falls through. */
717 case 4:
718 *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
719 case 3:
720 *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
721 case 2:
722 *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
723 case 1:
724 *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
725 }
726 target += bytesToWrite;
727 }
728 *sourceStart = source;
729 *targetStart = target;
730 return result;
731 }
732
733 /* --------------------------------------------------------------------- */
734
735 ConversionResult ConvertUTF8toUTF32(const UTF8** sourceStart,
736 const UTF8* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
737 ConversionFlags flags)
738 {
739 ConversionResult result = conversionOK;
740 const UTF8* source = *sourceStart;
741 UTF32* target = *targetStart;
742 while (source < sourceEnd)
743 {
744 UTF32 ch = 0;
745 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
746 if (source + extraBytesToRead >= sourceEnd)
747 {
748 result = sourceExhausted; break;
749 }
750 /* Do this check whether lenient or strict */
751 if (!isLegalUTF8(source, extraBytesToRead + 1))
752 {
753 result = sourceIllegal;
754 break;
755 }
756 /*
757 * The cases all fall through. See "Note A" below.
758 */
759 switch (extraBytesToRead)
760 {
761 case 5:
762 ch += *source++; ch <<= 6;
763 case 4:
764 ch += *source++; ch <<= 6;
765 case 3:
766 ch += *source++; ch <<= 6;
767 case 2:
768 ch += *source++; ch <<= 6;
769 case 1:
770 ch += *source++; ch <<= 6;
771 case 0:
772 ch += *source++;
773 }
774 ch -= offsetsFromUTF8[extraBytesToRead];
775
776 if (target >= targetEnd)
777 {
778 source -= (extraBytesToRead + 1); /* Back up the source pointer! */
779 result = targetExhausted; break;
780 }
781 if (ch <= UNI_MAX_LEGAL_UTF32)
782 {
783 /*
784 * UTF-16 surrogate values are illegal in UTF-32, and anything
785 * over Plane 17 (> 0x10FFFF) is illegal.
786 */
787 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
788 {
789 if (flags == strictConversion)
790 {
791 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
792 result = sourceIllegal;
793 break;
794 }
795 else
796 {
797 *target++ = UNI_REPLACEMENT_CHAR;
798 }
799 }
800 else
801 {
802 *target++ = ch;
803 }
804 }
805 else
806 {
807 /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
808 result = sourceIllegal;
809 *target++ = UNI_REPLACEMENT_CHAR;
810 }
811 }
812 *sourceStart = source;
813 *targetStart = target;
814 return result;
815 }
002 //convertUTF.h
003 //
004
005
006 /*
007 * Copyright 2001-2004 Unicode, Inc.
008 *
009 * Disclaimer
010 *
011 */
012
013 /* ---------------------------------------------------------------------
014
015 Conversions between UTF32, UTF-16, and UTF-8. Header file.
016
017 Several functions are included here, forming a complete set of
018 conversions between the three formats. UTF-7 is not included
019 here, but is handled in a separate source file.
020
021 Each of these routines takes pointers to input buffers and output
022 buffers. The input buffers are const.
023
024 Each routine converts the text between *sourceStart and sourceEnd,
025 putting the result into the buffer between *targetStart and
026 targetEnd. Note: the end pointers are *after* the last item: e.g.
027 *(sourceEnd - 1) is the last item.
028
029 The return result indicates whether the conversion was successful,
030 and if not, whether the problem was in the source or target buffers.
031 (Only the first encountered problem is indicated.)
032
033 After the conversion, *sourceStart and *targetStart are both
034 updated to point to the end of last text successfully converted in
035 the respective buffers.
036
037 Input parameters:
038 sourceStart - pointer to a pointer to the source buffer.
039 The contents of this are modified on return so that
040 it points at the next thing to be converted.
041 targetStart - similarly, pointer to pointer to the target buffer.
042 sourceEnd, targetEnd - respectively pointers to the ends of the
043 two buffers, for overflow checking only.
044
045 These conversion functions take a ConversionFlags argument. When this
046 flag is set to strict, both irregular sequences and isolated surrogates
047 will cause an error. When the flag is set to lenient, both irregular
048 sequences and isolated surrogates are converted.
049
050 Whether the flag is strict or lenient, all illegal sequences will cause
051 an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
052 or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
053 must check for illegal sequences.
054
055 When the flag is set to lenient, characters over 0x10FFFF are converted
056 to the replacement character; otherwise (when the flag is set to strict)
057 they constitute an error.
058
059 Output parameters:
060 The value "sourceIllegal" is returned from some routines if the input
061 sequence is malformed. When "sourceIllegal" is returned, the source
062 value will point to the illegal value that caused the problem. E.g.,
063 in UTF-8 when a sequence is malformed, it points to the start of the
064 malformed sequence.
065
066 Author: Mark E. Davis, 1994.
067 Rev History: Rick McGowan, fixes & updates May 2001.
068 Fixes & updates, Sept 2001.
069
070 ------------------------------------------------------------------------ */
071
072 /* ---------------------------------------------------------------------
073 The following 4 definitions are compiler-specific.
074 The C standard does not guarantee that wchar_t has at least
075 16 bits, so wchar_t is no less portable than unsigned short!
076 All should be unsigned values to avoid sign extension during
077 bit mask & shift operations.
078 ------------------------------------------------------------------------ */
079
/*
 * Code-unit types used throughout the converters. All are unsigned so that
 * masking and shifting never sign-extend.
 * NOTE(review): `unsigned long` is wider than 32 bits on some ABIs; confirm
 * that callers size their UTF-32 buffers with sizeof(UTF32).
 */
typedef unsigned long UTF32;    /* at least 32 bits */
typedef unsigned short UTF16;   /* at least 16 bits */
typedef unsigned char UTF8;     /* typically 8 bits */
typedef unsigned char Boolean;  /* 0 or 1 */

/*
 * Some fundamental constants. Each expansion is fully parenthesized so the
 * macro behaves as a single operand in any expression (e.g. under sizeof).
 */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD)
#define UNI_MAX_BMP          ((UTF32)0x0000FFFF)
#define UNI_MAX_UTF16        ((UTF32)0x0010FFFF)
#define UNI_MAX_UTF32        ((UTF32)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32  ((UTF32)0x0010FFFF)
091
/* Status code returned by every Convert* routine. */
typedef enum
{
    conversionOK = 0,    /* conversion successful */
    sourceExhausted = 1, /* partial character in source, but hit end */
    targetExhausted = 2, /* insufficient room in target for conversion */
    sourceIllegal = 3    /* source sequence is illegal/malformed */
} ConversionResult;

/* Selects how the converters treat surrogates and out-of-range values. */
typedef enum
{
    strictConversion = 0,
    lenientConversion = 1
} ConversionFlags;
108
109 /* This is for C++ and does no harm in C */
110 #ifdef __cplusplus
111 extern "C"
112 {
113 #endif
114
115 ConversionResult ConvertUTF8toUTF16(const UTF8** sourceStart,
116 const UTF8* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
117 ConversionFlags flags);
118
119 ConversionResult ConvertUTF16toUTF8(const UTF16** sourceStart,
120 const UTF16* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
121 ConversionFlags flags);
122
123 ConversionResult ConvertUTF8toUTF32(const UTF8** sourceStart,
124 const UTF8* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
125 ConversionFlags flags);
126
127 ConversionResult ConvertUTF32toUTF8(const UTF32** sourceStart,
128 const UTF32* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
129 ConversionFlags flags);
130
131 ConversionResult ConvertUTF16toUTF32(const UTF16** sourceStart,
132 const UTF16* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
133 ConversionFlags flags);
134
135 ConversionResult ConvertUTF32toUTF16(const UTF32** sourceStart,
136 const UTF32* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
137 ConversionFlags flags);
138
139 Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd);
140
141 #ifdef __cplusplus
142 }
143 #endif
144
145
146
147
148
149 ////////////////////////////////////////////////////////////////////////////////////////////////
150 //
151 //convertUTF.c
152 //
153
154 #include "ConvertUTF.h"
155 #ifdef CVTUTF_DEBUG
156 #include <stdio.h>
157 #endif
158
159 static const int halfShift = 10; /* used for shifting by 10 bits */
160
161 static const UTF32 halfBase = 0x0010000UL;
162 static const UTF32 halfMask = 0x3FFUL;
163
164 #define UNI_SUR_HIGH_START (UTF32)0xD800
165 #define UNI_SUR_HIGH_END (UTF32)0xDBFF
166 #define UNI_SUR_LOW_START (UTF32)0xDC00
167 #define UNI_SUR_LOW_END (UTF32)0xDFFF
168 #define false 0
169 #define true 1
170
171 /* --------------------------------------------------------------------- */
172
173 ConversionResult ConvertUTF32toUTF16(const UTF32** sourceStart,
174 const UTF32* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
175 ConversionFlags flags)
176 {
177 ConversionResult result = conversionOK;
178 const UTF32* source = *sourceStart;
179 UTF16* target = *targetStart;
180 while (source < sourceEnd)
181 {
182 UTF32 ch;
183 if (target >= targetEnd)
184 {
185 result = targetExhausted; break;
186 }
187 ch = *source++;
188 if (ch <= UNI_MAX_BMP)
189 {
190 /* Target is a character <= 0xFFFF */
191 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
192 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
193 {
194 if (flags == strictConversion)
195 {
196 --source; /* return to the illegal value itself */
197 result = sourceIllegal;
198 break;
199 }
200 else
201 {
202 *target++ = UNI_REPLACEMENT_CHAR;
203 }
204 }
205 else
206 {
207 *target++ = (UTF16) ch; /* normal case */
208 }
209 }
210 else if (ch > UNI_MAX_LEGAL_UTF32)
211 {
212 if (flags == strictConversion)
213 {
214 result = sourceIllegal;
215 }
216 else
217 {
218 *target++ = UNI_REPLACEMENT_CHAR;
219 }
220 }
221 else
222 {
223 /* target is a character in range 0xFFFF - 0x10FFFF. */
224 if (target + 1 >= targetEnd)
225 {
226 --source; /* Back up source pointer! */
227 result = targetExhausted; break;
228 }
229 ch -= halfBase;
230 *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
231 *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
232 }
233 }
234 *sourceStart = source;
235 *targetStart = target;
236 return result;
237 }
238
239 /* --------------------------------------------------------------------- */
240
241 ConversionResult ConvertUTF16toUTF32(const UTF16** sourceStart,
242 const UTF16* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
243 ConversionFlags flags)
244 {
245 ConversionResult result = conversionOK;
246 const UTF16* source = *sourceStart;
247 UTF32* target = *targetStart;
248 UTF32 ch, ch2;
249 while (source < sourceEnd)
250 {
251 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
252 ch = *source++;
253 /* If we have a surrogate pair, convert to UTF32 first. */
254 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
255 {
256 /* If the 16 bits following the high surrogate are in the source buffer... */
257 if (source < sourceEnd)
258 {
259 ch2 = *source;
260 /* If it's a low surrogate, convert to UTF32. */
261 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
262 {
263 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) +
264 (ch2 - UNI_SUR_LOW_START) +
265 halfBase;
266 ++source;
267 }
268 else if (flags == strictConversion)
269 {
270 /* it's an unpaired high surrogate */
271 --source; /* return to the illegal value itself */
272 result = sourceIllegal;
273 break;
274 }
275 }
276 else
277 {
278 /* We don't have the 16 bits following the high surrogate. */
279 --source; /* return to the high surrogate */
280 result = sourceExhausted;
281 break;
282 }
283 }
284 else if (flags == strictConversion)
285 {
286 /* UTF-16 surrogate values are illegal in UTF-32 */
287 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
288 {
289 --source; /* return to the illegal value itself */
290 result = sourceIllegal;
291 break;
292 }
293 }
294 if (target >= targetEnd)
295 {
296 source = oldSource; /* Back up source pointer! */
297 result = targetExhausted; break;
298 }
299 *target++ = ch;
300 }
301 *sourceStart = source;
302 *targetStart = target;
303 #ifdef CVTUTF_DEBUG
304 if (result == sourceIllegal)
305 {
306 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch,
307 ch2);
308 fflush(stderr);
309 }
310 #endif
311 return result;
312 }
313
314 /* --------------------------------------------------------------------- */
315
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
 * the number of trailing bytes expected to follow that byte.
 * Legal UTF-8 never has 4 or 5 trailing bytes.  Those entries are kept
 * for anyone who wants to decode the longer forms that earlier algorithms
 * allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00-0x1F */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x20-0x3F */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x40-0x5F */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x60-0x7F */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x80-0x9F */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xA0-0xBF */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xC0-0xDF */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0xE0-0xFF */
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3, 4,4,4,4,5,5,5,5
};
337
338 /*
339 * Magic values subtracted from a buffer value during UTF8 conversion.
340 * This table contains as many values as there might be trailing bytes
341 * in a UTF-8 sequence.
342 */
343 static const UTF32 offsetsFromUTF8[6] =
344 {
345 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, 0xFA082080UL,
346 0x82082080UL
347 };
348
349 /*
350 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
351 * into the first byte, depending on how many bytes follow. There are
352 * as many entries in this table as there are UTF-8 sequence types.
353 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
354 * for *legal* UTF-8 will be 4 or fewer bytes total.
355 */
356 static const UTF8 firstByteMark[7] =
357 {
358 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
359 };
360
361 /* --------------------------------------------------------------------- */
362
363 /* The interface converts a whole buffer to avoid function-call overhead.
364 * Constants have been gathered. Loops & conditionals have been removed as
365 * much as possible for efficiency, in favor of drop-through switches.
366 * (See "Note A" at the bottom of the file for equivalent code.)
367 * If your compiler supports it, the "isLegalUTF8" call can be turned
368 * into an inline function.
369 */
370
371 /* --------------------------------------------------------------------- */
372
373 ConversionResult ConvertUTF16toUTF8(const UTF16** sourceStart,
374 const UTF16* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
375 ConversionFlags flags)
376 {
377 ConversionResult result = conversionOK;
378 const UTF16* source = *sourceStart;
379 UTF8* target = *targetStart;
380 while (source < sourceEnd)
381 {
382 UTF32 ch;
383 unsigned short bytesToWrite = 0;
384 const UTF32 byteMask = 0xBF;
385 const UTF32 byteMark = 0x80;
386 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
387 ch = *source++;
388 /* If we have a surrogate pair, convert to UTF32 first. */
389 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
390 {
391 /* If the 16 bits following the high surrogate are in the source buffer... */
392 if (source < sourceEnd)
393 {
394 UTF32 ch2 = *source;
395 /* If it's a low surrogate, convert to UTF32. */
396 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
397 {
398 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) +
399 (ch2 - UNI_SUR_LOW_START) +
400 halfBase;
401 ++source;
402 }
403 else if (flags == strictConversion)
404 {
405 /* it's an unpaired high surrogate */
406 --source; /* return to the illegal value itself */
407 result = sourceIllegal;
408 break;
409 }
410 }
411 else
412 {
413 /* We don't have the 16 bits following the high surrogate. */
414 --source; /* return to the high surrogate */
415 result = sourceExhausted;
416 break;
417 }
418 }
419 else if (flags == strictConversion)
420 {
421 /* UTF-16 surrogate values are illegal in UTF-32 */
422 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
423 {
424 --source; /* return to the illegal value itself */
425 result = sourceIllegal;
426 break;
427 }
428 }
429 /* Figure out how many bytes the result will require */
430 if (ch < (UTF32) 0x80)
431 {
432 bytesToWrite = 1;
433 }
434 else if (ch < (UTF32) 0x800)
435 {
436 bytesToWrite = 2;
437 }
438 else if (ch < (UTF32) 0x10000)
439 {
440 bytesToWrite = 3;
441 }
442 else if (ch < (UTF32) 0x110000)
443 {
444 bytesToWrite = 4;
445 }
446 else
447 {
448 bytesToWrite = 3;
449 ch = UNI_REPLACEMENT_CHAR;
450 }
451
452 target += bytesToWrite;
453 if (target > targetEnd)
454 {
455 source = oldSource; /* Back up source pointer! */
456 target -= bytesToWrite; result = targetExhausted; break;
457 }
458 switch (bytesToWrite)
459 {
460 /* note: everything falls through. */
461 case 4:
462 *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
463 case 3:
464 *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
465 case 2:
466 *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
467 case 1:
468 *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
469 }
470 target += bytesToWrite;
471 }
472 *sourceStart = source;
473 *targetStart = target;
474 return result;
475 }
476
477 /* --------------------------------------------------------------------- */
478
479 /*
480 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
481 * This must be called with the length pre-determined by the first byte.
482 * If not calling this from ConvertUTF8to*, then the length can be set by:
483 * length = trailingBytesForUTF8[*source]+1;
484 * and the sequence is illegal right away if there aren't that many bytes
485 * available.
486 * If presented with a length > 4, this returns false. The Unicode
487 * definition of UTF-8 goes up to 4-byte sequences.
488 */
489
490 static Boolean isLegalUTF8(const UTF8* source, int length)
491 {
492 UTF8 a;
493 const UTF8* srcptr = source + length;
494 switch (length)
495 {
496 default:
497 return false;
498 /* Everything else falls through when "true"... */
499 case 4:
500 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
501 return false;
502 case 3:
503 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
504 return false;
505 case 2:
506 if ((a = (*--srcptr)) > 0xBF)
507 return false;
508
509 switch (*source)
510 {
511 /* no fall-through in this inner switch */
512 case 0xE0:
513 if (a < 0xA0)
514 return false; break;
515 case 0xED:
516 if (a > 0x9F)
517 return false; break;
518 case 0xF0:
519 if (a < 0x90)
520 return false; break;
521 case 0xF4:
522 if (a > 0x8F)
523 return false; break;
524 default:
525 if (a < 0x80)
526 return false;
527 }
528
529 case 1:
530 if (*source >= 0x80 && *source < 0xC2)
531 return false;
532 }
533 if (*source > 0xF4)
534 return false;
535 return true;
536 }
537
538 /* --------------------------------------------------------------------- */
539
540 /*
541 * Exported function to return whether a UTF-8 sequence is legal or not.
542 * This is not used here; it's just exported.
543 */
544 Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd)
545 {
546 int length = trailingBytesForUTF8[*source] + 1;
547 if (source + length > sourceEnd)
548 {
549 return false;
550 }
551 return isLegalUTF8(source, length);
552 }
553
554 /* --------------------------------------------------------------------- */
555
556 ConversionResult ConvertUTF8toUTF16(const UTF8** sourceStart,
557 const UTF8* sourceEnd, UTF16** targetStart, UTF16* targetEnd,
558 ConversionFlags flags)
559 {
560 ConversionResult result = conversionOK;
561 const UTF8* source = *sourceStart;
562 UTF16* target = *targetStart;
563 while (source < sourceEnd)
564 {
565 UTF32 ch = 0;
566 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
567 if (source + extraBytesToRead >= sourceEnd)
568 {
569 result = sourceExhausted; break;
570 }
571 /* Do this check whether lenient or strict */
572 if (!isLegalUTF8(source, extraBytesToRead + 1))
573 {
574 result = sourceIllegal;
575 break;
576 }
577 /*
578 * The cases all fall through. See "Note A" below.
579 */
580 switch (extraBytesToRead)
581 {
582 case 5:
583 ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
584 case 4:
585 ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
586 case 3:
587 ch += *source++; ch <<= 6;
588 case 2:
589 ch += *source++; ch <<= 6;
590 case 1:
591 ch += *source++; ch <<= 6;
592 case 0:
593 ch += *source++;
594 }
595 ch -= offsetsFromUTF8[extraBytesToRead];
596
597 if (target >= targetEnd)
598 {
599 source -= (extraBytesToRead + 1); /* Back up source pointer! */
600 result = targetExhausted; break;
601 }
602 if (ch <= UNI_MAX_BMP)
603 {
604 /* Target is a character <= 0xFFFF */
605 /* UTF-16 surrogate values are illegal in UTF-32 */
606 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
607 {
608 if (flags == strictConversion)
609 {
610 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
611 result = sourceIllegal;
612 break;
613 }
614 else
615 {
616 *target++ = UNI_REPLACEMENT_CHAR;
617 }
618 }
619 else
620 {
621 *target++ = (UTF16) ch; /* normal case */
622 }
623 }
624 else if (ch > UNI_MAX_UTF16)
625 {
626 if (flags == strictConversion)
627 {
628 result = sourceIllegal;
629 source -= (extraBytesToRead + 1); /* return to the start */
630 break; /* Bail out; shouldn't continue */
631 }
632 else
633 {
634 *target++ = UNI_REPLACEMENT_CHAR;
635 }
636 }
637 else
638 {
639 /* target is a character in range 0xFFFF - 0x10FFFF. */
640 if (target + 1 >= targetEnd)
641 {
642 source -= (extraBytesToRead + 1); /* Back up source pointer! */
643 result = targetExhausted; break;
644 }
645 ch -= halfBase;
646 *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
647 *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
648 }
649 }
650 *sourceStart = source;
651 *targetStart = target;
652 return result;
653 }
654
655 /* --------------------------------------------------------------------- */
656
657 ConversionResult ConvertUTF32toUTF8(const UTF32** sourceStart,
658 const UTF32* sourceEnd, UTF8** targetStart, UTF8* targetEnd,
659 ConversionFlags flags)
660 {
661 ConversionResult result = conversionOK;
662 const UTF32* source = *sourceStart;
663 UTF8* target = *targetStart;
664 while (source < sourceEnd)
665 {
666 UTF32 ch;
667 unsigned short bytesToWrite = 0;
668 const UTF32 byteMask = 0xBF;
669 const UTF32 byteMark = 0x80;
670 ch = *source++;
671 if (flags == strictConversion)
672 {
673 /* UTF-16 surrogate values are illegal in UTF-32 */
674 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
675 {
676 --source; /* return to the illegal value itself */
677 result = sourceIllegal;
678 break;
679 }
680 }
681 /*
682 * Figure out how many bytes the result will require. Turn any
683 * illegally large UTF32 things (> Plane 17) into replacement chars.
684 */
685 if (ch < (UTF32) 0x80)
686 {
687 bytesToWrite = 1;
688 }
689 else if (ch < (UTF32) 0x800)
690 {
691 bytesToWrite = 2;
692 }
693 else if (ch < (UTF32) 0x10000)
694 {
695 bytesToWrite = 3;
696 }
697 else if (ch <= UNI_MAX_LEGAL_UTF32)
698 {
699 bytesToWrite = 4;
700 }
701 else
702 {
703 bytesToWrite = 3;
704 ch = UNI_REPLACEMENT_CHAR;
705 result = sourceIllegal;
706 }
707
708 target += bytesToWrite;
709 if (target > targetEnd)
710 {
711 --source; /* Back up source pointer! */
712 target -= bytesToWrite; result = targetExhausted; break;
713 }
714 switch (bytesToWrite)
715 {
716 /* note: everything falls through. */
717 case 4:
718 *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
719 case 3:
720 *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
721 case 2:
722 *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
723 case 1:
724 *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
725 }
726 target += bytesToWrite;
727 }
728 *sourceStart = source;
729 *targetStart = target;
730 return result;
731 }
732
733 /* --------------------------------------------------------------------- */
734
735 ConversionResult ConvertUTF8toUTF32(const UTF8** sourceStart,
736 const UTF8* sourceEnd, UTF32** targetStart, UTF32* targetEnd,
737 ConversionFlags flags)
738 {
739 ConversionResult result = conversionOK;
740 const UTF8* source = *sourceStart;
741 UTF32* target = *targetStart;
742 while (source < sourceEnd)
743 {
744 UTF32 ch = 0;
745 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
746 if (source + extraBytesToRead >= sourceEnd)
747 {
748 result = sourceExhausted; break;
749 }
750 /* Do this check whether lenient or strict */
751 if (!isLegalUTF8(source, extraBytesToRead + 1))
752 {
753 result = sourceIllegal;
754 break;
755 }
756 /*
757 * The cases all fall through. See "Note A" below.
758 */
759 switch (extraBytesToRead)
760 {
761 case 5:
762 ch += *source++; ch <<= 6;
763 case 4:
764 ch += *source++; ch <<= 6;
765 case 3:
766 ch += *source++; ch <<= 6;
767 case 2:
768 ch += *source++; ch <<= 6;
769 case 1:
770 ch += *source++; ch <<= 6;
771 case 0:
772 ch += *source++;
773 }
774 ch -= offsetsFromUTF8[extraBytesToRead];
775
776 if (target >= targetEnd)
777 {
778 source -= (extraBytesToRead + 1); /* Back up the source pointer! */
779 result = targetExhausted; break;
780 }
781 if (ch <= UNI_MAX_LEGAL_UTF32)
782 {
783 /*
784 * UTF-16 surrogate values are illegal in UTF-32, and anything
785 * over Plane 17 (> 0x10FFFF) is illegal.
786 */
787 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
788 {
789 if (flags == strictConversion)
790 {
791 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
792 result = sourceIllegal;
793 break;
794 }
795 else
796 {
797 *target++ = UNI_REPLACEMENT_CHAR;
798 }
799 }
800 else
801 {
802 *target++ = ch;
803 }
804 }
805 else
806 {
807 /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
808 result = sourceIllegal;
809 *target++ = UNI_REPLACEMENT_CHAR;
810 }
811 }
812 *sourceStart = source;
813 *targetStart = target;
814 return result;
815 }
所有评论,共0条:( 我也来说两句)
代码
