unicode.hpp
Go to the documentation of this file.
00001 /* 00002 Copyright 2005-2007 Adobe Systems Incorporated 00003 Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt 00004 or a copy at http://stlab.adobe.com/licenses.html) 00005 */ 00006 00007 /*************************************************************************************************/ 00008 00009 #ifndef ADOBE_UNICODE_HPP 00010 #define ADOBE_UNICODE_HPP 00011 00012 /*************************************************************************************************/ 00013 00014 #include <adobe/config.hpp> 00015 00016 #include <adobe/algorithm/for_each.hpp> 00017 00018 #include <boost/cstdint.hpp> 00019 #include <boost/utility/enable_if.hpp> 00020 00021 #include <vector> 00022 #include <cassert> 00023 #include <stdexcept> 00024 00025 /*************************************************************************************************/ 00026 00027 namespace adobe { 00028 00029 /*************************************************************************************************/ 00030 00031 #if !defined(ADOBE_NO_DOCUMENTATION) 00032 00033 /*************************************************************************************************/ 00034 00035 template <typename T> 00036 struct is_utf8_type 00037 { enum { value = sizeof(T) == 1 }; }; 00038 00039 /*************************************************************************************************/ 00040 00041 template <typename T> 00042 struct is_utf16_type 00043 { enum { value = sizeof(T) == 2 }; }; 00044 00045 /*************************************************************************************************/ 00046 00047 template <typename T> 00048 struct is_utf32_type 00049 { enum { value = sizeof(T) == 4 }; }; 00050 00051 /*************************************************************************************************/ 00052 00053 template <typename I> 00054 struct is_utf8_iterator_type 00055 { enum { value = is_utf8_type<typename std::iterator_traits<I>::value_type>::value }; }; 00056 00057 /*************************************************************************************************/ 00058 00059 template <typename I> 00060 struct is_utf16_iterator_type 00061 { enum { value = is_utf16_type<typename std::iterator_traits<I>::value_type>::value }; }; 00062 00063 /*************************************************************************************************/ 00064 00065 template <typename I> 00066 struct is_utf32_iterator_type 00067 { enum { value = is_utf32_type<typename std::iterator_traits<I>::value_type>::value }; }; 00068 00069 /*************************************************************************************************/ 00070 00071 namespace implementation { 00072 00073 /*************************************************************************************************/ 00074 00075 // REVISIT (fbrereto) : I don't need to INIT_ONCE these, do I? 00076 00077 const unsigned char to_utf32_pivot_1_k(128); 00078 const unsigned char to_utf32_pivot_2_k(192); 00079 const unsigned char to_utf32_pivot_3_k(224); 00080 const unsigned char to_utf32_pivot_4_k(240); 00081 const unsigned char to_utf32_pivot_5_k(248); 00082 const unsigned char to_utf32_pivot_6_k(252); 00083 const unsigned char to_utf32_pivot_7_k(254); 00084 00085 const boost::uint32_t to_utf8_pivot_1_k(1UL << 7); 00086 const boost::uint32_t to_utf8_pivot_2_k(1UL << 11); 00087 const boost::uint32_t to_utf8_pivot_3_k(1UL << 16); 00088 const boost::uint32_t to_utf8_pivot_4_k(1UL << 21); 00089 const boost::uint32_t to_utf8_pivot_5_k(1UL << 26); 00090 00091 const boost::uint16_t to_utf16_surrogate_pivot_k(65535); 00092 const boost::uint16_t utf16_high_surrogate_front_k(0xd800); 00093 const boost::uint16_t utf16_high_surrogate_back_k(0xdbff); 00094 const boost::uint16_t utf16_low_surrogate_front_k(0xdc00); 00095 const boost::uint16_t utf16_low_surrogate_back_k(0xdfff); 00096 00097 /*************************************************************************************************/ 00098 /* 00099 NOTE (fbrereto) : The char(...) designations are required on windows, otherwise the MSVC 00100 compiler complains in the utf8_add_mask routines with the following: 00101 00102 "warning C4309: 'specialization' : truncation of constant value" 00103 */ 00104 template <std::size_t NumBytes> struct utf8_header_t { }; 00105 template <> struct utf8_header_t<0> { static const char value = '\x80'; }; // nonheader 00106 //template <> struct utf8_header_t<1> { static const char value = '\x00'; }; // illegal 00107 template <> struct utf8_header_t<2> { static const char value = '\xC0'; }; 00108 template <> struct utf8_header_t<3> { static const char value = '\xE0'; }; 00109 template <> struct utf8_header_t<4> { static const char value = '\xF0'; }; 00110 template <> struct utf8_header_t<5> { static const char value = '\xF8'; }; 00111 template <> struct utf8_header_t<6> { static const char value = '\xFC'; }; 00112 00113 /*************************************************************************************************/ 00114 00115 template <char Mask, typename BinaryInteger> 00116 inline char add_mask(BinaryInteger code) 00117 { return static_cast<char>(code | Mask); } 00118 00119 template <std::size_t NumBytes, bool Header, typename BinaryInteger> 00120 inline char utf8_add_mask(BinaryInteger code) 00121 { return add_mask<utf8_header_t<Header ? NumBytes : 0>::value>(code); } 00122 00123 00124 //MM concept gcc-4.1.1 workaround 00125 inline char utf8_add_mask_0_false(boost::uint32_t code) 00126 { 00127 return utf8_add_mask<0,false>(code); 00128 } 00129 00130 /*************************************************************************************************/ 00131 00132 template<char Mask, typename BinaryInteger> 00133 inline char strip_mask(BinaryInteger code) 00134 { return static_cast<char>(code & ~Mask); } 00135 00136 template <std::size_t NumBytes, bool Header, typename BinaryInteger> 00137 inline char utf8_strip_mask(BinaryInteger code) 00138 { return strip_mask<utf8_header_t<Header ? NumBytes : 0>::value>(code); } 00139 00140 /*************************************************************************************************/ 00141 00142 template <std::size_t Position> 00143 inline boost::uint32_t promote_fragment(char fragment) 00144 { return boost::uint32_t(fragment << ((Position - 1) * 6)); } 00145 00146 template <> 00147 inline boost::uint32_t promote_fragment<1>(char fragment) 00148 { return boost::uint32_t(fragment); } 00149 00150 template <> 00151 inline boost::uint32_t promote_fragment<0>(char); // unimplemented 00152 00153 /*************************************************************************************************/ 00154 00155 template <std::size_t Position> 00156 inline char demote_fragment(boost::uint32_t fragment) 00157 { return char((fragment >> ((Position - 1) * 6)) & 0x0000003F); } 00158 00159 template <> 00160 inline char demote_fragment<1>(boost::uint32_t fragment) 00161 { return char(fragment & 0x0000003F); } 00162 00163 template <> 00164 inline char demote_fragment<0>(boost::uint32_t); // unimplemented 00165 00166 //MM concept gcc-4.1.1 workaround 00167 inline char demote_fragment_1(boost::uint32_t fragment) 00168 { 00169 return demote_fragment<1>(fragment); 00170 } 00171 00172 00173 /*************************************************************************************************/ 00174 00175 template <std::size_t ByteCount, bool Header = true> 00176 struct demotion_engine_t 00177 { 00178 template <typename OutputIterator> 00179 inline OutputIterator operator () (boost::uint32_t code, OutputIterator i) 00180 { 00181 *i = utf8_add_mask<ByteCount, Header>(demote_fragment<ByteCount>(code)); 00182 00183 ++i; 00184 00185 return demotion_engine_t<ByteCount - 1, false>()(code, i); 00186 } 00187 }; 00188 00189 00190 template <> 00191 struct demotion_engine_t<1, false> 00192 { 00193 template <typename OutputIterator> 00194 inline OutputIterator operator () (boost::uint32_t code, OutputIterator i) 00195 { 00196 *i = utf8_add_mask_0_false(demote_fragment_1(code)); 00197 00198 return ++i; 00199 } 00200 }; 00201 00202 /*************************************************************************************************/ 00203 00204 template <std::size_t ByteCount, bool Header = true> 00205 struct promotion_engine_t 00206 { 00207 template <typename InputIterator> 00208 inline boost::uint32_t operator () (InputIterator& first, InputIterator last) 00209 { 00210 /* 00211 CodeWarrior 9.4 doesn't like this code composited into one line; 00212 GCC doesn't seem to have a problem. 00213 */ 00214 00215 char n(*first); 00216 char stripped(utf8_strip_mask<ByteCount, Header>(n)); 00217 boost::uint32_t shifted(promote_fragment<ByteCount>(stripped)); 00218 00219 ++first; 00220 00221 if (first == last) 00222 throw std::runtime_error("unicode: utf32 conversion ran out of input"); 00223 00224 return shifted | promotion_engine_t<ByteCount - 1, false>()(first, last); 00225 } 00226 }; 00227 00228 template <> 00229 struct promotion_engine_t<1, false> 00230 { 00231 template <typename InputIterator> 00232 inline boost::uint32_t operator () (InputIterator& first, InputIterator) 00233 { 00234 boost::uint32_t result(promote_fragment<1>(utf8_strip_mask<0, false>(*first))); 00235 00236 ++first; 00237 00238 return result; 00239 } 00240 }; 00241 00242 /*************************************************************************************************/ 00243 00244 template <typename InputIterator, typename DestInteger> 00245 typename boost::enable_if<is_utf16_iterator_type<InputIterator>, InputIterator>::type 00246 to_utf32 (InputIterator first, InputIterator last, DestInteger& result) 00247 { 00248 if (first == last) return first; 00249 00250 boost::uint16_t code(static_cast<boost::uint16_t>(*first)); 00251 00252 ++first; 00253 00254 if (code >= implementation::utf16_high_surrogate_front_k && 00255 code <= implementation::utf16_high_surrogate_back_k) 00256 { 00257 result = 0; 00258 00259 if (first == last) 00260 throw std::runtime_error("unicode: utf16 high surrogate found without low surrogate"); 00261 00262 boost::uint16_t low(static_cast<boost::uint16_t>(*first)); 00263 00264 assert (low >= implementation::utf16_low_surrogate_front_k && 00265 low <= implementation::utf16_low_surrogate_back_k); 00266 00267 ++first; 00268 00269 result = (code - implementation::utf16_high_surrogate_front_k) * 0x400 + 00270 (low - implementation::utf16_low_surrogate_front_k) + 0x10000; 00271 } 00272 else if (code >= implementation::utf16_low_surrogate_front_k && 00273 code <= implementation::utf16_low_surrogate_back_k) 00274 { throw std::runtime_error("unicode: utf16 low surrogate found without high surrogate"); } 00275 else 00276 { result = static_cast<DestInteger>(code); } 00277 00278 return first; 00279 } 00280 00281 /*************************************************************************************************/ 00282 00283 template <typename InputIterator, typename DestInteger> 00284 typename boost::enable_if<is_utf8_iterator_type<InputIterator>, InputIterator>::type 00285 to_utf32 (InputIterator first, InputIterator last, DestInteger& result) 00286 { 00287 if (first == last) 00288 return first; 00289 00290 unsigned char n(static_cast<unsigned char>(*first)); 00291 00292 if (n < implementation::to_utf32_pivot_1_k) 00293 { result = static_cast<DestInteger>(n); ++first; } 00294 else if (n < implementation::to_utf32_pivot_2_k) 00295 { throw std::runtime_error("unicode: ill-defined utf8 (< 192)"); } 00296 else if (n < implementation::to_utf32_pivot_3_k) 00297 result = implementation::promotion_engine_t<2>()(first, last); 00298 else if (n < implementation::to_utf32_pivot_4_k) 00299 result = implementation::promotion_engine_t<3>()(first, last); 00300 else if (n < implementation::to_utf32_pivot_5_k) 00301 result = implementation::promotion_engine_t<4>()(first, last); 00302 else if (n < implementation::to_utf32_pivot_6_k) 00303 result = implementation::promotion_engine_t<5>()(first, last); 00304 else if (n < implementation::to_utf32_pivot_7_k) 00305 result = implementation::promotion_engine_t<6>()(first, last); 00306 else 00307 { throw std::runtime_error("unicode: ill-defined utf8 (>= 254)"); } 00308 00309 return first; 00310 } 00311 00312 /*************************************************************************************************/ 00313 00314 template <typename InputIterator, typename DestInteger> 00315 typename boost::enable_if<is_utf32_iterator_type<InputIterator>, InputIterator>::type 00316 to_utf32 (InputIterator first, InputIterator last, DestInteger& result) 00317 { 00318 if (first == last) 00319 return first; 00320 00321 result = *first; 00322 00323 return ++first; 00324 } 00325 00326 /*************************************************************************************************/ 00327 00328 } // namespace implementation 00329 00330 /*************************************************************************************************/ 00331 00332 #endif 00333 00334 /*************************************************************************************************/ 00335 /* 00336 utf32 -> utf8 00337 - 1 source value 00338 - n output values 00339 */ 00340 00341 template < typename T, // T models Integer; T must be a valid UTF32-encoded code point 00342 typename O> // O models OutputIterator 00343 typename boost::enable_if<is_utf32_type<T>, O>::type 00344 value_to_utf8(T code, O output) 00345 { 00346 if (code < implementation::to_utf8_pivot_1_k) // UTF-8 is 1 byte long 00347 { *output = static_cast<char>(code); ++output; } 00348 else if (code < implementation::to_utf8_pivot_2_k) // UTF-8 is 2 bytes long 00349 output = implementation::demotion_engine_t<2>()(code, output); 00350 else if (code < implementation::to_utf8_pivot_3_k) // UTF-8 is 3 bytes long 00351 output = implementation::demotion_engine_t<3>()(code, output); 00352 else if (code < implementation::to_utf8_pivot_4_k) // UTF-8 is 4 bytes long 00353 output = implementation::demotion_engine_t<4>()(code, output); 00354 else if (code < implementation::to_utf8_pivot_5_k) // UTF-8 is 5 bytes long 00355 output = implementation::demotion_engine_t<5>()(code, output); 00356 else // UTF-8 is 6 bytes long 00357 output = implementation::demotion_engine_t<6>()(code, output); 00358 00359 return output; 00360 } 00361 00362 /*************************************************************************************************/ 00363 /* 00364 utf16 -> utf8 00365 - 1 source value 00366 - n output values 00367 */ 00368 00369 template < typename T, // T models Integer; T must be a valid UTF16-encoded code point 00370 typename O> // O models OutputIterator 00371 typename boost::enable_if<is_utf16_type<T>, O>::type 00372 value_to_utf8(T code, O output) 00373 { 00374 return value_to_utf8(static_cast<boost::uint32_t>(code), output); 00375 } 00376 00377 /*************************************************************************************************/ 00378 /* 00379 utf8 -> utf8 00380 - 1 source value 00381 - 1 output value 00382 */ 00383 00384 template < typename T, // T models Integer; T must be a valid UTF8-encoded code point 00385 typename O> // O models OutputIterator 00386 typename boost::enable_if<is_utf8_type<T>, O>::type 00387 value_to_utf8(T code, O output) 00388 { 00389 *output++ = code; 00390 00391 return output; 00392 } 00393 00394 /*************************************************************************************************/ 00395 /* 00396 utf16 -> utf8 00397 - n source values 00398 - m output values 00399 */ 00400 00401 template < typename I, // I models InputIterator 00402 typename O> // O models OutputIterator 00403 typename boost::enable_if<is_utf16_iterator_type<I>, O>::type 00404 to_utf8(I first, I last, O output) 00405 { 00406 while (first != last) 00407 { 00408 boost::uint32_t result; 00409 00410 first = implementation::to_utf32(first, last, result); 00411 00412 output = value_to_utf8(result, output); 00413 } 00414 00415 return output; 00416 } 00417 00418 /*************************************************************************************************/ 00419 /* 00420 utf32 -> utf8 00421 - n source values 00422 - m output values 00423 */ 00424 00425 template < typename I, // I models InputIterator 00426 typename O> // O models OutputIterator 00427 typename boost::enable_if<is_utf32_iterator_type<I>, O>::type 00428 to_utf8(I first, I last, O output) 00429 { 00430 if (first == last) return output; 00431 00432 typedef typename std::iterator_traits<I>::value_type value_type; 00433 00434 adobe::for_each(first, last, boost::bind(&value_to_utf8<value_type, O>, _1, boost::ref(output))); 00435 00436 return output; 00437 } 00438 00439 /*************************************************************************************************/ 00440 /* 00441 utf8 -> utf8 00442 - n source values 00443 - m output values 00444 */ 00445 00446 template < typename I, // I models InputIterator 00447 typename O> // O models OutputIterator 00448 typename boost::enable_if<is_utf8_iterator_type<I>, O>::type 00449 to_utf8(I first, I last, O output) 00450 { 00451 return std::copy(first, last, output); 00452 } 00453 00454 /*************************************************************************************************/ 00455 /* 00456 utf32 -> utf16 00457 - 1 source value 00458 - n output values 00459 */ 00460 00461 template < typename T, // T models Integer; sizeof(T) must equal 4; code must be valid utf32 00462 typename O> // O models OutputIterator 00463 typename boost::enable_if<is_utf32_type<T>, O>::type 00464 value_to_utf16(T code, O output) 00465 { 00466 if (code <= implementation::to_utf16_surrogate_pivot_k) 00467 { 00468 *output = static_cast<boost::uint16_t>(code); 00469 } 00470 else 00471 { 00472 *output = static_cast<boost::uint16_t>((code - 0x10000) / 0x400 + implementation::utf16_high_surrogate_front_k); 00473 00474 ++output; 00475 00476 *output = static_cast<boost::uint16_t>((code - 0x10000) % 0x400 + implementation::utf16_low_surrogate_front_k); 00477 } 00478 00479 return ++output; 00480 } 00481 00482 /*************************************************************************************************/ 00483 /* 00484 utf8 -> utf16 00485 - n source values 00486 - m output values 00487 */ 00488 template < typename I, // I models InputIterator 00489 typename O> // O models OutputIterator 00490 typename boost::enable_if<is_utf8_iterator_type<I>, O>::type 00491 to_utf16(I first, I last, O output) 00492 { 00493 while (first != last) 00494 { 00495 boost::uint32_t result; 00496 00497 first = implementation::to_utf32(first, last, result); 00498 00499 output = value_to_utf16(result, output); 00500 } 00501 00502 return output; 00503 } 00504 00505 /*************************************************************************************************/ 00506 /* 00507 utf16 -> utf16 00508 - n source values 00509 - n output values 00510 */ 00511 template < typename I, // I models InputIterator 00512 typename O> // O models OutputIterator 00513 typename boost::enable_if<is_utf16_iterator_type<I>, O>::type 00514 to_utf16(I first, I last, O output) 00515 { 00516 return std::copy(first, last, output); 00517 } 00518 00519 /*************************************************************************************************/ 00520 /* 00521 Precondition: [ first, last ) must convert to exactly one UTF-16 character 00522 */ 00523 00524 template <typename I> 00525 inline typename boost::enable_if<is_utf8_iterator_type<I>, boost::uint16_t>::type 00526 to_utf16(I first, I last) 00527 { 00528 boost::uint32_t result; 00529 00530 implementation::to_utf32(first, last, result); 00531 00532 return static_cast<boost::uint16_t>(result); 00533 } 00534 00535 /*************************************************************************************************/ 00536 /* 00537 utf16 -> utf32 00538 - n source values 00539 - m output values 00540 00541 utf8 -> utf32 00542 - n source values 00543 - m output values 00544 */ 00545 00546 template < typename I, // I models InputIterator 00547 typename O> // O models OutputIterator 00548 O to_utf32(I first, I last, O output) 00549 { 00550 boost::uint32_t result; 00551 00552 while (first != last) 00553 { 00554 first = implementation::to_utf32(first, last, result); 00555 00556 *output = result; 00557 00558 ++output; 00559 } 00560 00561 return output; 00562 } 00563 00564 /*************************************************************************************************/ 00565 /* 00566 Precondition: [ first, last ) must convert to exactly one UTF-32 character 00567 */ 00568 00569 template <typename I> // I models InputIterator 00570 inline boost::uint32_t to_utf32(I first, I last) 00571 { 00572 boost::uint32_t result; 00573 00574 implementation::to_utf32(first, last, result); 00575 00576 return result; 00577 } 00578 00579 /*************************************************************************************************/ 00580 00581 } // namespace adobe 00582 00583 /*************************************************************************************************/ 00584 00585 #endif 00586 00587 /*************************************************************************************************/ |