stlab.adobe.com Adobe Systems Incorporated

unicode.hpp

Go to the documentation of this file.
00001 /*
00002     Copyright 2005-2007 Adobe Systems Incorporated
00003     Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
00004     or a copy at http://stlab.adobe.com/licenses.html)
00005 */
00006 
00007 /*************************************************************************************************/
00008 
00009 #ifndef ADOBE_UNICODE_HPP
00010 #define ADOBE_UNICODE_HPP
00011 
00012 /*************************************************************************************************/
00013 
00014 #include <adobe/config.hpp>
00015 
00016 #include <adobe/algorithm/for_each.hpp>
00017 
00018 #include <boost/cstdint.hpp>
00019 #include <boost/utility/enable_if.hpp>
00020 
00021 #include <vector>
00022 #include <cassert>
00023 #include <stdexcept>
00024 
00025 /*************************************************************************************************/
00026 
00027 namespace adobe {
00028 
00029 /*************************************************************************************************/
00030 
00031 #if !defined(ADOBE_NO_DOCUMENTATION)
00032 
00033 /*************************************************************************************************/
00034 
/*
    Size-based trait: value is nonzero exactly when sizeof(CodeUnit) is 1, which is how this
    header decides that a type holds UTF-8 code units.
*/
template <typename CodeUnit>
struct is_utf8_type
{
    enum { value = (sizeof(CodeUnit) == 1) };
};
00038 
00039 /*************************************************************************************************/
00040 
/*
    Size-based trait: value is nonzero exactly when sizeof(CodeUnit) is 2, which is how this
    header decides that a type holds UTF-16 code units.
*/
template <typename CodeUnit>
struct is_utf16_type
{
    enum { value = (sizeof(CodeUnit) == 2) };
};
00044 
00045 /*************************************************************************************************/
00046 
/*
    Size-based trait: value is nonzero exactly when sizeof(CodeUnit) is 4, which is how this
    header decides that a type holds UTF-32 code units.
*/
template <typename CodeUnit>
struct is_utf32_type
{
    enum { value = (sizeof(CodeUnit) == 4) };
};
00050 
00051 /*************************************************************************************************/
00052 
00053 template <typename I>
00054 struct is_utf8_iterator_type
00055 { enum { value = is_utf8_type<typename std::iterator_traits<I>::value_type>::value }; };
00056 
00057 /*************************************************************************************************/
00058 
00059 template <typename I>
00060 struct is_utf16_iterator_type
00061 { enum { value = is_utf16_type<typename std::iterator_traits<I>::value_type>::value }; };
00062 
00063 /*************************************************************************************************/
00064 
00065 template <typename I>
00066 struct is_utf32_iterator_type
00067 { enum { value = is_utf32_type<typename std::iterator_traits<I>::value_type>::value }; };
00068 
00069 /*************************************************************************************************/
00070 
00071 namespace implementation {
00072 
00073 /*************************************************************************************************/
00074 
00075 // REVISIT (fbrereto) : I don't need to INIT_ONCE these, do I?
00076 
/*
    Exclusive upper bounds that the UTF-8 decoder compares a lead byte against, in ascending
    order. A lead byte below pivot 1 is a single-byte character; below pivot 2 it is rejected;
    below pivots 3 through 7 it starts a 2- through 6-byte sequence respectively; anything at
    or above pivot 7 is rejected.
*/
const unsigned char     to_utf32_pivot_1_k = 0x80;
const unsigned char     to_utf32_pivot_2_k = 0xC0;
const unsigned char     to_utf32_pivot_3_k = 0xE0;
const unsigned char     to_utf32_pivot_4_k = 0xF0;
const unsigned char     to_utf32_pivot_5_k = 0xF8;
const unsigned char     to_utf32_pivot_6_k = 0xFC;
const unsigned char     to_utf32_pivot_7_k = 0xFE;
00084 
00085 const boost::uint32_t   to_utf8_pivot_1_k(1UL << 7);
00086 const boost::uint32_t   to_utf8_pivot_2_k(1UL << 11);
00087 const boost::uint32_t   to_utf8_pivot_3_k(1UL << 16);
00088 const boost::uint32_t   to_utf8_pivot_4_k(1UL << 21);
00089 const boost::uint32_t   to_utf8_pivot_5_k(1UL << 26);
00090 
00091 const boost::uint16_t   to_utf16_surrogate_pivot_k(65535);
00092 const boost::uint16_t   utf16_high_surrogate_front_k(0xd800);
00093 const boost::uint16_t   utf16_high_surrogate_back_k(0xdbff);
00094 const boost::uint16_t   utf16_low_surrogate_front_k(0xdc00);
00095 const boost::uint16_t   utf16_low_surrogate_back_k(0xdfff);
00096 
00097 /*************************************************************************************************/
/*
    utf8_header_t<N>::value is the marker bit pattern for a UTF-8 byte: N == 0 is the pattern
    for a non-header byte, N in [2, 6] is the pattern for the header byte of an N-byte
    sequence. N == 1 is deliberately left without a value (illegal).

    NOTE (fbrereto) : The values must be of type char on windows, otherwise the MSVC
                      compiler complains in the utf8_add_mask routines with the following:

                      "warning C4309: 'specialization' : truncation of constant value"
*/
template <std::size_t NumBytes>
struct utf8_header_t
{ };

template <>
struct utf8_header_t<0>
{ static const char value = '\x80'; }; // nonheader

// template <>
// struct utf8_header_t<1>
// { static const char value = '\x00'; }; // illegal

template <>
struct utf8_header_t<2>
{ static const char value = '\xC0'; };

template <>
struct utf8_header_t<3>
{ static const char value = '\xE0'; };

template <>
struct utf8_header_t<4>
{ static const char value = '\xF0'; };

template <>
struct utf8_header_t<5>
{ static const char value = '\xF8'; };

template <>
struct utf8_header_t<6>
{ static const char value = '\xFC'; };
00112 
00113 /*************************************************************************************************/
00114 
/*
    Returns code with every bit of the compile-time Mask switched on, narrowed to char.
*/
template <char Mask, typename BinaryInteger>
inline char add_mask(BinaryInteger code)
{
    return static_cast<char>(code | Mask);
}
00118 
00119 template <std::size_t NumBytes, bool Header, typename BinaryInteger>
00120 inline char utf8_add_mask(BinaryInteger code)
00121 { return add_mask<utf8_header_t<Header ? NumBytes : 0>::value>(code); }
00122 
00123 
00124 //MM concept gcc-4.1.1 workaround 
00125 inline char utf8_add_mask_0_false(boost::uint32_t code) 
00126 {
00127     return utf8_add_mask<0,false>(code);
00128 }
00129 
00130 /*************************************************************************************************/
00131 
/*
    Returns code with every bit of the compile-time Mask cleared, narrowed to char.
*/
template <char Mask, typename BinaryInteger>
inline char strip_mask(BinaryInteger code)
{
    return static_cast<char>(code & ~Mask);
}
00135 
00136 template <std::size_t NumBytes, bool Header, typename BinaryInteger>
00137 inline char utf8_strip_mask(BinaryInteger code)
00138 { return strip_mask<utf8_header_t<Header ? NumBytes : 0>::value>(code); }
00139 
00140 /*************************************************************************************************/
00141 
00142 template <std::size_t Position>
00143 inline boost::uint32_t promote_fragment(char fragment)
00144 { return boost::uint32_t(fragment << ((Position - 1) * 6)); }
00145 
00146 template <>
00147 inline boost::uint32_t promote_fragment<1>(char fragment)
00148 { return boost::uint32_t(fragment); }
00149 
00150 template <>
00151 inline boost::uint32_t promote_fragment<0>(char); // unimplemented
00152 
00153 /*************************************************************************************************/
00154 
00155 template <std::size_t Position>
00156 inline char demote_fragment(boost::uint32_t fragment)
00157 { return char((fragment >> ((Position - 1) * 6)) & 0x0000003F); }
00158 
00159 template <>
00160 inline char demote_fragment<1>(boost::uint32_t fragment)
00161 { return char(fragment & 0x0000003F); }
00162 
00163 template <>
00164 inline char demote_fragment<0>(boost::uint32_t); // unimplemented
00165 
00166 //MM concept gcc-4.1.1 workaround 
00167 inline char demote_fragment_1(boost::uint32_t fragment) 
00168 {
00169     return demote_fragment<1>(fragment);
00170 }
00171 
00172 
00173 /*************************************************************************************************/
00174 
00175 template <std::size_t ByteCount, bool Header = true>
00176 struct demotion_engine_t
00177 {
00178     template <typename OutputIterator>
00179     inline OutputIterator operator () (boost::uint32_t code, OutputIterator i)
00180     {
00181         *i = utf8_add_mask<ByteCount, Header>(demote_fragment<ByteCount>(code));
00182 
00183         ++i;
00184 
00185         return demotion_engine_t<ByteCount - 1, false>()(code, i);
00186     }
00187 };
00188 
00189 
00190 template <>
00191 struct demotion_engine_t<1, false>
00192 {
00193     template <typename OutputIterator>
00194     inline OutputIterator operator () (boost::uint32_t code, OutputIterator i)
00195     {
00196         *i = utf8_add_mask_0_false(demote_fragment_1(code));
00197 
00198         return ++i;
00199     }
00200 };
00201 
00202 /*************************************************************************************************/
00203 
00204 template <std::size_t ByteCount, bool Header = true>
00205 struct promotion_engine_t
00206 {
00207     template <typename InputIterator>
00208     inline boost::uint32_t operator () (InputIterator& first, InputIterator last)
00209     {
00210         /*
00211             CodeWarrior 9.4 doesn't like this code composited into one line;
00212             GCC doesn't seem to have a problem.
00213         */
00214 
00215         char            n(*first);
00216         char            stripped(utf8_strip_mask<ByteCount, Header>(n));
00217         boost::uint32_t shifted(promote_fragment<ByteCount>(stripped));
00218 
00219         ++first;
00220 
00221         if (first == last)
00222             throw std::runtime_error("unicode: utf32 conversion ran out of input");
00223 
00224         return shifted | promotion_engine_t<ByteCount - 1, false>()(first, last);
00225     }
00226 };
00227 
00228 template <>
00229 struct promotion_engine_t<1, false>
00230 {
00231     template <typename InputIterator>
00232     inline boost::uint32_t operator () (InputIterator& first, InputIterator)
00233     {
00234         boost::uint32_t result(promote_fragment<1>(utf8_strip_mask<0, false>(*first)));
00235 
00236         ++first;
00237 
00238         return result;
00239     }
00240 };
00241 
00242 /*************************************************************************************************/
00243 
00244 template <typename InputIterator, typename DestInteger>
00245 typename boost::enable_if<is_utf16_iterator_type<InputIterator>, InputIterator>::type
00246     to_utf32 (InputIterator first, InputIterator last, DestInteger& result)
00247 {
00248     if (first == last) return first;
00249 
00250     boost::uint16_t code(static_cast<boost::uint16_t>(*first));
00251 
00252     ++first;
00253 
00254     if (code >= implementation::utf16_high_surrogate_front_k &&
00255         code <= implementation::utf16_high_surrogate_back_k)
00256     {
00257         result = 0;
00258 
00259         if (first == last)
00260             throw std::runtime_error("unicode: utf16 high surrogate found without low surrogate"); 
00261 
00262         boost::uint16_t low(static_cast<boost::uint16_t>(*first));
00263 
00264         assert (low >= implementation::utf16_low_surrogate_front_k &&
00265                 low <= implementation::utf16_low_surrogate_back_k);
00266 
00267         ++first;
00268 
00269         result = (code - implementation::utf16_high_surrogate_front_k) * 0x400 +
00270                  (low - implementation::utf16_low_surrogate_front_k) + 0x10000;
00271     }
00272     else if (code >= implementation::utf16_low_surrogate_front_k &&
00273              code <= implementation::utf16_low_surrogate_back_k)
00274         { throw std::runtime_error("unicode: utf16 low surrogate found without high surrogate"); }
00275     else
00276         { result = static_cast<DestInteger>(code); }
00277 
00278     return first;
00279 }
00280 
00281 /*************************************************************************************************/
00282 
00283 template <typename InputIterator, typename DestInteger>
00284 typename boost::enable_if<is_utf8_iterator_type<InputIterator>, InputIterator>::type
00285     to_utf32 (InputIterator first, InputIterator last, DestInteger& result)
00286 {
00287     if (first == last)
00288         return first;
00289 
00290     unsigned char n(static_cast<unsigned char>(*first));
00291 
00292     if (n < implementation::to_utf32_pivot_1_k)
00293         { result = static_cast<DestInteger>(n); ++first; }
00294     else if (n < implementation::to_utf32_pivot_2_k)
00295         { throw std::runtime_error("unicode: ill-defined utf8 (< 192)"); }
00296     else if (n < implementation::to_utf32_pivot_3_k)
00297         result = implementation::promotion_engine_t<2>()(first, last);
00298     else if (n < implementation::to_utf32_pivot_4_k)
00299         result = implementation::promotion_engine_t<3>()(first, last);
00300     else if (n < implementation::to_utf32_pivot_5_k)
00301         result = implementation::promotion_engine_t<4>()(first, last);
00302     else if (n < implementation::to_utf32_pivot_6_k)
00303         result = implementation::promotion_engine_t<5>()(first, last);
00304     else if (n < implementation::to_utf32_pivot_7_k)
00305         result = implementation::promotion_engine_t<6>()(first, last);
00306     else
00307         { throw std::runtime_error("unicode: ill-defined utf8 (>= 254)"); }
00308 
00309     return first;
00310 }
00311 
00312 /*************************************************************************************************/
00313 
00314 template <typename InputIterator, typename DestInteger>
00315 typename boost::enable_if<is_utf32_iterator_type<InputIterator>, InputIterator>::type
00316     to_utf32 (InputIterator first, InputIterator last, DestInteger& result)
00317 {
00318     if (first == last)
00319         return first;
00320 
00321     result = *first;
00322 
00323     return ++first;
00324 }
00325 
00326 /*************************************************************************************************/
00327 
00328 } // namespace implementation
00329 
00330 /*************************************************************************************************/
00331 
00332 #endif
00333 
00334 /*************************************************************************************************/
00335 /*
00336         utf32 -> utf8
00337             - 1 source value
00338             - n output values
00339 */
00340 
00341 template <  typename T, // T models Integer; T must be a valid UTF32-encoded code point
00342             typename O> // O models OutputIterator
00343 typename boost::enable_if<is_utf32_type<T>, O>::type
00344     value_to_utf8(T code, O output)
00345 {
00346     if (code < implementation::to_utf8_pivot_1_k) // UTF-8 is 1 byte long
00347         { *output = static_cast<char>(code); ++output; }
00348     else if (code < implementation::to_utf8_pivot_2_k) // UTF-8 is 2 bytes long
00349         output = implementation::demotion_engine_t<2>()(code, output);
00350     else if (code < implementation::to_utf8_pivot_3_k) // UTF-8 is 3 bytes long
00351         output = implementation::demotion_engine_t<3>()(code, output);
00352     else if (code < implementation::to_utf8_pivot_4_k) // UTF-8 is 4 bytes long
00353         output = implementation::demotion_engine_t<4>()(code, output);
00354     else if (code < implementation::to_utf8_pivot_5_k) // UTF-8 is 5 bytes long
00355         output = implementation::demotion_engine_t<5>()(code, output);
00356     else // UTF-8 is 6 bytes long
00357         output = implementation::demotion_engine_t<6>()(code, output);
00358 
00359     return output;
00360 }
00361 
00362 /*************************************************************************************************/
00363 /*
00364         utf16 -> utf8
00365             - 1 source value
00366             - n output values
00367 */
00368 
00369 template <  typename T, // T models Integer; T must be a valid UTF16-encoded code point
00370             typename O> // O models OutputIterator
00371 typename boost::enable_if<is_utf16_type<T>, O>::type
00372     value_to_utf8(T code, O output)
00373 {
00374     return value_to_utf8(static_cast<boost::uint32_t>(code), output);
00375 }
00376 
00377 /*************************************************************************************************/
00378 /*
00379         utf8 -> utf8
00380             - 1 source value
00381             - 1 output value
00382 */
00383 
00384 template <  typename T, // T models Integer; T must be a valid UTF8-encoded code point
00385             typename O> // O models OutputIterator
00386 typename boost::enable_if<is_utf8_type<T>, O>::type
00387     value_to_utf8(T code, O output)
00388 {
00389     *output++ = code;
00390 
00391     return output;
00392 }
00393 
00394 /*************************************************************************************************/
00395 /*
00396         utf16 -> utf8
00397             - n source values
00398             - m output values
00399 */
00400 
00401 template <  typename I, // I models InputIterator
00402             typename O> // O models OutputIterator
00403 typename boost::enable_if<is_utf16_iterator_type<I>, O>::type
00404     to_utf8(I first, I last, O output)
00405 {
00406     while (first != last)
00407     {
00408         boost::uint32_t result;
00409 
00410         first = implementation::to_utf32(first, last, result);
00411 
00412         output = value_to_utf8(result, output);
00413     }
00414 
00415     return output;
00416 }
00417 
00418 /*************************************************************************************************/
00419 /*
00420         utf32 -> utf8
00421             - n source values
00422             - m output values
00423 */
00424 
00425 template <  typename I, // I models InputIterator
00426             typename O> // O models OutputIterator
00427 typename boost::enable_if<is_utf32_iterator_type<I>, O>::type
00428     to_utf8(I first, I last, O output)
00429 {
00430     if (first == last) return output;
00431 
00432     typedef typename std::iterator_traits<I>::value_type value_type;
00433 
00434     adobe::for_each(first, last, boost::bind(&value_to_utf8<value_type, O>, _1, boost::ref(output)));
00435 
00436     return output;
00437 }
00438 
00439 /*************************************************************************************************/
00440 /*
00441         utf8 -> utf8
00442             - n source values
00443             - m output values
00444 */
00445 
00446 template <  typename I, // I models InputIterator
00447             typename O> // O models OutputIterator
00448 typename boost::enable_if<is_utf8_iterator_type<I>, O>::type
00449     to_utf8(I first, I last, O output)
00450 {
00451     return std::copy(first, last, output);
00452 }
00453 
00454 /*************************************************************************************************/
00455 /*
00456         utf32 -> utf16
00457             - 1 source value
00458             - n output values
00459 */
00460 
00461 template <  typename T, // T models Integer; sizeof(T) must equal 4; code must be valid utf32
00462             typename O> // O models OutputIterator
00463 typename boost::enable_if<is_utf32_type<T>, O>::type
00464     value_to_utf16(T code, O output)
00465 {
00466     if (code <= implementation::to_utf16_surrogate_pivot_k)
00467     {
00468         *output = static_cast<boost::uint16_t>(code);
00469     }
00470     else
00471     {
00472         *output = static_cast<boost::uint16_t>((code - 0x10000) / 0x400 + implementation::utf16_high_surrogate_front_k);
00473 
00474         ++output;
00475 
00476         *output = static_cast<boost::uint16_t>((code - 0x10000) % 0x400 + implementation::utf16_low_surrogate_front_k);
00477     }
00478 
00479     return ++output;
00480 }
00481 
00482 /*************************************************************************************************/
00483 /*
00484         utf8 -> utf16
00485             - n source values
00486             - m output values
00487 */
00488 template <  typename I, // I models InputIterator
00489             typename O> // O models OutputIterator
00490 typename boost::enable_if<is_utf8_iterator_type<I>, O>::type
00491     to_utf16(I first, I last, O output)
00492 {
00493     while (first != last)
00494     {
00495         boost::uint32_t result;
00496 
00497         first = implementation::to_utf32(first, last, result);
00498 
00499         output = value_to_utf16(result, output);
00500     }
00501 
00502     return output;
00503 }
00504 
00505 /*************************************************************************************************/
00506 /*
00507         utf16 -> utf16
00508             - n source values
00509             - n output values
00510 */
00511 template <  typename I, // I models InputIterator
00512             typename O> // O models OutputIterator
00513 typename boost::enable_if<is_utf16_iterator_type<I>, O>::type
00514     to_utf16(I first, I last, O output)
00515 {
00516     return std::copy(first, last, output);
00517 }
00518 
00519 /*************************************************************************************************/
00520 /*
00521     Precondition: [ first, last ) must convert to exactly one UTF-16 character
00522 */
00523 
00524 template <typename I>
00525 inline typename boost::enable_if<is_utf8_iterator_type<I>, boost::uint16_t>::type
00526     to_utf16(I first, I last)
00527 {
00528     boost::uint32_t result;
00529 
00530     implementation::to_utf32(first, last, result);
00531 
00532     return static_cast<boost::uint16_t>(result);
00533 }
00534 
00535 /*************************************************************************************************/
00536 /*
00537         utf16 -> utf32
00538             - n source values
00539             - m output values
00540 
00541         utf8 -> utf32
00542             - n source values
00543             - m output values
00544 */
00545 
00546 template <  typename I, // I models InputIterator
00547             typename O> // O models OutputIterator
00548 O to_utf32(I first, I last, O output)
00549 {
00550     boost::uint32_t result;
00551 
00552     while (first != last)
00553     {
00554         first = implementation::to_utf32(first, last, result);
00555 
00556         *output = result;
00557 
00558         ++output;
00559     }
00560 
00561     return output;
00562 }
00563 
00564 /*************************************************************************************************/
00565 /*
00566     Precondition: [ first, last ) must convert to exactly one UTF-32 character
00567 */
00568 
00569 template <typename I> // I models InputIterator
00570 inline boost::uint32_t to_utf32(I first, I last)
00571 {
00572     boost::uint32_t result;
00573 
00574     implementation::to_utf32(first, last, result);
00575 
00576     return result;
00577 }
00578 
00579 /*************************************************************************************************/
00580 
00581 } // namespace adobe
00582 
00583 /*************************************************************************************************/
00584 
00585 #endif
00586     
00587 /*************************************************************************************************/

Copyright © 2006-2007 Adobe Systems Incorporated.

Use of this website signifies your agreement to the Terms of Use and Online Privacy Policy.

Search powered by Google