libstdc++
codecvt_specializations.h
Go to the documentation of this file.
00001 // Locale support (codecvt) -*- C++ -*-
00002 
00003 // Copyright (C) 2000-2015 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 //
00026 // ISO C++ 14882: 22.2.1.5 Template class codecvt
00027 //
00028 
00029 // Written by Benjamin Kosnik <bkoz@redhat.com>
00030 
00031 /** @file ext/codecvt_specializations.h
00032  *  This file is a GNU extension to the Standard C++ Library.
00033  */
00034 
00035 #ifndef _EXT_CODECVT_SPECIALIZATIONS_H
00036 #define _EXT_CODECVT_SPECIALIZATIONS_H 1
00037 
00038 #include <bits/c++config.h>
00039 #include <locale>
00040 #include <iconv.h>
00041 
00042 namespace __gnu_cxx _GLIBCXX_VISIBILITY(default)
00043 {
00044 _GLIBCXX_BEGIN_NAMESPACE_CXX11
00045 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00046 
/// Extension to use iconv for dealing with character encodings.
// This includes conversions and comparisons between various character
// sets.  This object encapsulates data that may need to be shared between
// char_traits, codecvt and ctype.
//
// The object owns two iconv conversion descriptors, one per direction.
// init() opens them when both encoding names are non-empty and destroy()
// closes them.  Copying duplicates the encoding names, byte-order markers
// and byte ratio and opens fresh descriptors; the conversion state held
// by the source object's descriptors is not carried over.
class encoding_state
{
public:
  // Types:
  // NB: A conversion descriptor subsumes and enhances the
  // functionality of a simple state type such as mbstate_t.
  typedef iconv_t     descriptor_type;

protected:
  // Name of internal character set encoding.
  std::string         _M_int_enc;

  // Name of external character set encoding.
  std::string         _M_ext_enc;

  // Conversion descriptor from external encoding to internal encoding.
  // Zero when no descriptor is open.
  descriptor_type     _M_in_desc;

  // Conversion descriptor from internal encoding to external encoding.
  // Zero when no descriptor is open.
  descriptor_type     _M_out_desc;

  // The byte-order marker for the external encoding, if necessary.
  int                 _M_ext_bom;

  // The byte-order marker for the internal encoding, if necessary.
  int                 _M_int_bom;

  // Number of external bytes needed to construct one complete
  // character in the internal encoding.
  // NB: -1 indicates variable, or stateful, encodings.
  int                 _M_bytes;

public:
  // Creates a state with empty encoding names and no descriptors;
  // good() is false for such an object.
  explicit
  encoding_state()
  : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
  { }

  // Creates a state converting between the internal encoding __int and
  // the external encoding __ext and opens both descriptors.
  // A null name is treated as an empty name (no descriptors are opened).
  // Throws std::runtime_error (via std::__throw_runtime_error) if
  // iconv_open fails for either direction.
  explicit
  encoding_state(const char* __int, const char* __ext,
                 int __ibom = 0, int __ebom = 0, int __bytes = 1)
  : _M_int_enc(__int ? __int : ""), _M_ext_enc(__ext ? __ext : ""),
    _M_in_desc(0), _M_out_desc(0),
    _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
  { init(); }

  // 21.1.2 traits typedefs
  // p4
  // typedef STATE_T state_type
  // requires: state_type shall meet the requirements of
  // CopyConstructible types (20.1.3)
  // NB: This does not preserve the actual state of the conversion
  // descriptor member, but it does duplicate the encoding
  // information.
  encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
  { construct(__obj); }

  // Need assignment operator as well.
  // Self-assignment is a no-op so that live descriptors (and their
  // conversion state) are not torn down and reopened for nothing.
  encoding_state&
  operator=(const encoding_state& __obj)
  {
    if (this != &__obj)
      construct(__obj);
    return *this;
  }

  ~encoding_state()
  { destroy(); }

  // True when both descriptors are open, i.e. neither is zero nor the
  // iconv_open failure value (iconv_t)(-1).
  bool
  good() const throw()
  {
    const descriptor_type __err = (iconv_t)(-1);
    const bool __in_ok = _M_in_desc && _M_in_desc != __err;
    const bool __out_ok = _M_out_desc && _M_out_desc != __err;
    return __in_ok && __out_ok;
  }

  int
  character_ratio() const
  { return _M_bytes; }

  const std::string
  internal_encoding() const
  { return _M_int_enc; }

  int
  internal_bom() const
  { return _M_int_bom; }

  const std::string
  external_encoding() const
  { return _M_ext_enc; }

  int
  external_bom() const
  { return _M_ext_bom; }

  const descriptor_type&
  in_descriptor() const
  { return _M_in_desc; }

  const descriptor_type&
  out_descriptor() const
  { return _M_out_desc; }

protected:
  // Opens whichever descriptor is not yet open, provided both encoding
  // names are non-empty.  On failure every descriptor owned by this
  // object is released and reset to zero before the exception is
  // thrown: when init() runs inside a constructor the destructor will
  // not be called, so anything left open here would leak.
  void
  init()
  {
    const descriptor_type __err = (iconv_t)(-1);
    const bool __have_encodings = !_M_int_enc.empty() && !_M_ext_enc.empty();
    if (!_M_in_desc && __have_encodings)
      {
        _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
        if (_M_in_desc == __err)
          {
            destroy();
            std::__throw_runtime_error(__N("encoding_state::_M_init "
                                    "creating iconv input descriptor failed"));
          }
      }
    if (!_M_out_desc && __have_encodings)
      {
        _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
        if (_M_out_desc == __err)
          {
            destroy();
            std::__throw_runtime_error(__N("encoding_state::_M_init "
                                  "creating iconv output descriptor failed"));
          }
      }
  }

  // Replaces this object's configuration with a copy of __obj's and
  // opens new descriptors for it.  Existing descriptors are closed first.
  void
  construct(const encoding_state& __obj)
  {
    destroy();
    _M_int_enc = __obj._M_int_enc;
    _M_ext_enc = __obj._M_ext_enc;
    _M_ext_bom = __obj._M_ext_bom;
    _M_int_bom = __obj._M_int_bom;
    _M_bytes = __obj._M_bytes;
    init();
  }

  // Closes any open descriptor and resets both members to zero.  The
  // reset is unconditional so that a stale (iconv_t)(-1) failure value
  // does not stop a later init() from trying to open the descriptor
  // again.
  void
  destroy() throw()
  {
    const descriptor_type __err = (iconv_t)(-1);
    if (_M_in_desc && _M_in_desc != __err)
      iconv_close(_M_in_desc);
    _M_in_desc = 0;
    if (_M_out_desc && _M_out_desc != __err)
      iconv_close(_M_out_desc);
    _M_out_desc = 0;
  }
};
00205 
00206   /// encoding_char_traits
00207   // Custom traits type with encoding_state for the state type, and the
00208   // associated fpos<encoding_state> for the position type, all other
00209   // bits equivalent to the required char_traits instantiations.
00210   template<typename _CharT>
00211     struct encoding_char_traits
00212     : public std::char_traits<_CharT>
00213     {
00214       typedef encoding_state                            state_type;
00215       typedef typename std::fpos<state_type>            pos_type;
00216     };
00217 
00218 _GLIBCXX_END_NAMESPACE_VERSION
00219 _GLIBCXX_END_NAMESPACE_CXX11
00220 } // namespace
00221 
00222 
00223 namespace std _GLIBCXX_VISIBILITY(default)
00224 {
00225 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00226 
00227   using __gnu_cxx::encoding_state;
00228 
00229   /// codecvt<InternT, _ExternT, encoding_state> specialization.
00230   // This partial specialization takes advantage of iconv to provide
00231   // code conversions between a large number of character encodings.
00232   template<typename _InternT, typename _ExternT>
00233     class codecvt<_InternT, _ExternT, encoding_state>
00234     : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
00235     {
00236     public:      
00237       // Types:
00238       typedef codecvt_base::result                      result;
00239       typedef _InternT                                  intern_type;
00240       typedef _ExternT                                  extern_type;
00241       typedef __gnu_cxx::encoding_state                 state_type;
00242       typedef state_type::descriptor_type               descriptor_type;
00243 
00244       // Data Members:
00245       static locale::id                 id;
00246 
00247       explicit 
00248       codecvt(size_t __refs = 0)
00249       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
00250       { }
00251 
00252       explicit 
00253       codecvt(state_type& __enc, size_t __refs = 0)
00254       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
00255       { }
00256 
00257      protected:
00258       virtual 
00259       ~codecvt() { }
00260 
00261       virtual result
00262       do_out(state_type& __state, const intern_type* __from, 
00263              const intern_type* __from_end, const intern_type*& __from_next,
00264              extern_type* __to, extern_type* __to_end,
00265              extern_type*& __to_next) const;
00266 
00267       virtual result
00268       do_unshift(state_type& __state, extern_type* __to, 
00269                  extern_type* __to_end, extern_type*& __to_next) const;
00270 
00271       virtual result
00272       do_in(state_type& __state, const extern_type* __from, 
00273             const extern_type* __from_end, const extern_type*& __from_next,
00274             intern_type* __to, intern_type* __to_end, 
00275             intern_type*& __to_next) const;
00276 
00277       virtual int 
00278       do_encoding() const throw();
00279 
00280       virtual bool 
00281       do_always_noconv() const throw();
00282 
00283       virtual int 
00284       do_length(state_type&, const extern_type* __from, 
00285                 const extern_type* __end, size_t __max) const;
00286 
00287       virtual int 
00288       do_max_length() const throw();
00289     };
00290 
00291   template<typename _InternT, typename _ExternT>
00292     locale::id 
00293     codecvt<_InternT, _ExternT, encoding_state>::id;
00294 
// iconv() is declared with differing types for its second parameter:
// SUSv2 and others use 'const char**', while glibc 2.2 uses 'char**',
// which matches the POSIX 1003.1-2001 standard.  This adaptor deduces
// that parameter type (_Tp) from the function pointer it is given and
// converts the caller's 'char**' to it, so call sites can be written
// once and g++ does the work for us.
//
// Returns whatever __func returns; all other arguments are forwarded
// unchanged.
template<typename _Tp>
  inline size_t
  __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
                  iconv_t __cd, char** __inbuf, size_t* __inbytes,
                  char** __outbuf, size_t* __outbytes)
  {
    // A C-style cast is deliberate: depending on _Tp this is either an
    // identity conversion or one that adds const.
    _Tp __src = (_Tp)__inbuf;
    return (*__func)(__cd, __src, __inbytes, __outbuf, __outbytes);
  }
00305 
00306   template<typename _InternT, typename _ExternT>
00307     codecvt_base::result
00308     codecvt<_InternT, _ExternT, encoding_state>::
00309     do_out(state_type& __state, const intern_type* __from, 
00310            const intern_type* __from_end, const intern_type*& __from_next,
00311            extern_type* __to, extern_type* __to_end,
00312            extern_type*& __to_next) const
00313     {
00314       result __ret = codecvt_base::error;
00315       if (__state.good())
00316         {
00317           const descriptor_type& __desc = __state.out_descriptor();
00318           const size_t __fmultiple = sizeof(intern_type);
00319           size_t __fbytes = __fmultiple * (__from_end - __from);
00320           const size_t __tmultiple = sizeof(extern_type);
00321           size_t __tbytes = __tmultiple * (__to_end - __to); 
00322           
00323           // Argument list for iconv specifies a byte sequence. Thus,
00324           // all to/from arrays must be brutally casted to char*.
00325           char* __cto = reinterpret_cast<char*>(__to);
00326           char* __cfrom;
00327           size_t __conv;
00328 
00329           // Some encodings need a byte order marker as the first item
00330           // in the byte stream, to designate endian-ness. The default
00331           // value for the byte order marker is NULL, so if this is
00332           // the case, it's not necessary and we can just go on our
00333           // merry way.
00334           int __int_bom = __state.internal_bom();
00335           if (__int_bom)
00336             {     
00337               size_t __size = __from_end - __from;
00338               intern_type* __cfixed = static_cast<intern_type*>
00339                 (__builtin_alloca(sizeof(intern_type) * (__size + 1)));
00340               __cfixed[0] = static_cast<intern_type>(__int_bom);
00341               char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
00342               __cfrom = reinterpret_cast<char*>(__cfixed);
00343               __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
00344                                         &__fbytes, &__cto, &__tbytes); 
00345             }
00346           else
00347             {
00348               intern_type* __cfixed = const_cast<intern_type*>(__from);
00349               __cfrom = reinterpret_cast<char*>(__cfixed);
00350               __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes, 
00351                                        &__cto, &__tbytes); 
00352             }
00353 
00354           if (__conv != size_t(-1))
00355             {
00356               __from_next = reinterpret_cast<const intern_type*>(__cfrom);
00357               __to_next = reinterpret_cast<extern_type*>(__cto);
00358               __ret = codecvt_base::ok;
00359             }
00360           else 
00361             {
00362               if (__fbytes < __fmultiple * (__from_end - __from))
00363                 {
00364                   __from_next = reinterpret_cast<const intern_type*>(__cfrom);
00365                   __to_next = reinterpret_cast<extern_type*>(__cto);
00366                   __ret = codecvt_base::partial;
00367                 }
00368               else
00369                 __ret = codecvt_base::error;
00370             }
00371         }
00372       return __ret; 
00373     }
00374 
00375   template<typename _InternT, typename _ExternT>
00376     codecvt_base::result
00377     codecvt<_InternT, _ExternT, encoding_state>::
00378     do_unshift(state_type& __state, extern_type* __to, 
00379                extern_type* __to_end, extern_type*& __to_next) const
00380     {
00381       result __ret = codecvt_base::error;
00382       if (__state.good())
00383         {
00384           const descriptor_type& __desc = __state.in_descriptor();
00385           const size_t __tmultiple = sizeof(intern_type);
00386           size_t __tlen = __tmultiple * (__to_end - __to); 
00387           
00388           // Argument list for iconv specifies a byte sequence. Thus,
00389           // all to/from arrays must be brutally casted to char*.
00390           char* __cto = reinterpret_cast<char*>(__to);
00391           size_t __conv = __iconv_adaptor(iconv,__desc, 0, 0,
00392                                           &__cto, &__tlen); 
00393           
00394           if (__conv != size_t(-1))
00395             {
00396               __to_next = reinterpret_cast<extern_type*>(__cto);
00397               if (__tlen == __tmultiple * (__to_end - __to))
00398                 __ret = codecvt_base::noconv;
00399               else if (__tlen == 0)
00400                 __ret = codecvt_base::ok;
00401               else
00402                 __ret = codecvt_base::partial;
00403             }
00404           else 
00405             __ret = codecvt_base::error;
00406         }
00407       return __ret; 
00408     }
00409    
00410   template<typename _InternT, typename _ExternT>
00411     codecvt_base::result
00412     codecvt<_InternT, _ExternT, encoding_state>::
00413     do_in(state_type& __state, const extern_type* __from, 
00414           const extern_type* __from_end, const extern_type*& __from_next,
00415           intern_type* __to, intern_type* __to_end, 
00416           intern_type*& __to_next) const
00417     { 
00418       result __ret = codecvt_base::error;
00419       if (__state.good())
00420         {
00421           const descriptor_type& __desc = __state.in_descriptor();
00422           const size_t __fmultiple = sizeof(extern_type);
00423           size_t __flen = __fmultiple * (__from_end - __from);
00424           const size_t __tmultiple = sizeof(intern_type);
00425           size_t __tlen = __tmultiple * (__to_end - __to); 
00426           
00427           // Argument list for iconv specifies a byte sequence. Thus,
00428           // all to/from arrays must be brutally casted to char*.
00429           char* __cto = reinterpret_cast<char*>(__to);
00430           char* __cfrom;
00431           size_t __conv;
00432 
00433           // Some encodings need a byte order marker as the first item
00434           // in the byte stream, to designate endian-ness. The default
00435           // value for the byte order marker is NULL, so if this is
00436           // the case, it's not necessary and we can just go on our
00437           // merry way.
00438           int __ext_bom = __state.external_bom();
00439           if (__ext_bom)
00440             {     
00441               size_t __size = __from_end - __from;
00442               extern_type* __cfixed =  static_cast<extern_type*>
00443                 (__builtin_alloca(sizeof(extern_type) * (__size + 1)));
00444               __cfixed[0] = static_cast<extern_type>(__ext_bom);
00445               char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
00446               __cfrom = reinterpret_cast<char*>(__cfixed);
00447               __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
00448                                        &__flen, &__cto, &__tlen); 
00449             }
00450           else
00451             {
00452               extern_type* __cfixed = const_cast<extern_type*>(__from);
00453               __cfrom = reinterpret_cast<char*>(__cfixed);
00454               __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
00455                                        &__flen, &__cto, &__tlen); 
00456             }
00457 
00458           
00459           if (__conv != size_t(-1))
00460             {
00461               __from_next = reinterpret_cast<const extern_type*>(__cfrom);
00462               __to_next = reinterpret_cast<intern_type*>(__cto);
00463               __ret = codecvt_base::ok;
00464             }
00465           else 
00466             {
00467               if (__flen < static_cast<size_t>(__from_end - __from))
00468                 {
00469                   __from_next = reinterpret_cast<const extern_type*>(__cfrom);
00470                   __to_next = reinterpret_cast<intern_type*>(__cto);
00471                   __ret = codecvt_base::partial;
00472                 }
00473               else
00474                 __ret = codecvt_base::error;
00475             }
00476         }
00477       return __ret; 
00478     }
00479   
00480   template<typename _InternT, typename _ExternT>
00481     int 
00482     codecvt<_InternT, _ExternT, encoding_state>::
00483     do_encoding() const throw()
00484     {
00485       int __ret = 0;
00486       if (sizeof(_ExternT) <= sizeof(_InternT))
00487         __ret = sizeof(_InternT) / sizeof(_ExternT);
00488       return __ret; 
00489     }
00490   
00491   template<typename _InternT, typename _ExternT>
00492     bool 
00493     codecvt<_InternT, _ExternT, encoding_state>::
00494     do_always_noconv() const throw()
00495     { return false; }
00496   
00497   template<typename _InternT, typename _ExternT>
00498     int 
00499     codecvt<_InternT, _ExternT, encoding_state>::
00500     do_length(state_type&, const extern_type* __from, 
00501               const extern_type* __end, size_t __max) const
00502     { return std::min(__max, static_cast<size_t>(__end - __from)); }
00503 
00504   // _GLIBCXX_RESOLVE_LIB_DEFECTS
00505   // 74.  Garbled text for codecvt::do_max_length
00506   template<typename _InternT, typename _ExternT>
00507     int 
00508     codecvt<_InternT, _ExternT, encoding_state>::
00509     do_max_length() const throw()
00510     { return 1; }
00511 
00512 _GLIBCXX_END_NAMESPACE_VERSION
00513 } // namespace
00514 
00515 #endif