stlab.adobe.com Adobe Systems Incorporated

xml_parser.hpp

Go to the documentation of this file.
00001 /*
00002     Copyright 2005-2007 Adobe Systems Incorporated
00003     Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
00004     or a copy at http://stlab.adobe.com/licenses.html)
00005 */
00006 
00007 /*************************************************************************************************/
00008 
00009 #ifndef ADOBE_XML_PARSER_HPP
00010 #define ADOBE_XML_PARSER_HPP
00011 
00012 /*************************************************************************************************/
00013 
00014 #include <adobe/config.hpp>
00015 
00016 #include <adobe/any_regular.hpp>
00017 #include <adobe/algorithm/set.hpp>
00018 #include <adobe/istream.hpp>
00019 #include <adobe/array.hpp>
00020 #include <adobe/copy_on_write.hpp>
00021 #include <adobe/name.hpp>
00022 #include <adobe/dictionary.hpp>
00023 #include <adobe/string.hpp>
00024 #include <adobe/implementation/xml_lex.hpp>
00025 #include <adobe/implementation/xml_token.hpp>
00026 #include <adobe/implementation/parser_shared.hpp>
00027 
00028 #include <boost/function.hpp>
00029 #include <boost/noncopyable.hpp>
00030 #include <boost/operators.hpp>
00031 #include <boost/bind.hpp>
00032 #include <boost/array.hpp>
00033 #include <boost/iterator/iterator_facade.hpp>
00034 
00035 #include <utility>
00036 #include <istream>
00037 #include <sstream>
00038 #include <iomanip>
00039 #include <cassert>
00040 #include <list>
00041 
00042 /*************************************************************************************************/
00043 
00044 namespace adobe {
00045 
00046 /*************************************************************************************************/
00047 
00048 // NOTE (fbrereto) : Class declaration for the documentation is in xml_parser.dox
00049 struct attribute_set_t : public boost::equality_comparable<attribute_set_t>
00050 {
00051     typedef token_range_t                key_type;
00052     typedef token_range_t                mapped_type;
00053     typedef std::pair<key_type, mapped_type>    value_type;
00054     typedef std::vector<value_type>             set_type;
00055     typedef set_type::size_type                 size_type;
00056     typedef set_type::const_iterator            const_iterator;
00057     typedef const_iterator                      iterator;
00058 
00065     struct less_t : std::binary_function<value_type, value_type, bool>
00066     {
00067         bool operator () (const value_type& x, const value_type& y) const
00068         {
00069             return  token_range_less(x.first, y.first) ||
00070                     (!token_range_less(y.first, x.first) &&
00071                     token_range_less(x.second, y.second));
00072         }
00073     };
00074 
00080     struct less_key_only_t : std::binary_function<value_type, value_type, bool>
00081     {
00082         bool operator () (const value_type& x, const value_type& y) const
00083         {
00084             return token_range_less(x.first, y.first);
00085         }
00086     };
00087 
00099     bool lower_bound(const value_type& attribute, set_type::iterator& result)
00100     {
00101         result = adobe::lower_bound(set_m.write(), attribute, less_key_only_t());
00102 
00103         return  result != set_m.write().end() &&
00104                 token_range_equal(result->first, attribute.first);
00105     }
00106 
00118     bool lower_bound(const key_type& key, set_type::iterator& result)
00119     { return lower_bound(value_type(key, mapped_type()), result); }
00120 
00124     bool lower_bound(const value_type& attribute, set_type::const_iterator& result) const
00125     {
00126         result = adobe::lower_bound(*set_m, attribute, less_key_only_t());
00127 
00128         return  result != set_m->end() &&
00129                 token_range_equal(result->first, attribute.first);
00130     }
00131 
00135     bool lower_bound(const key_type& key, set_type::const_iterator& result) const
00136     { return lower_bound(value_type(key, mapped_type()), result); }
00137 
00147     mapped_type operator [] (const key_type& key) const
00148     {
00149         set_type::const_iterator result;
00150 
00151         if (lower_bound(key, result))
00152             return result->second;
00153 
00154         return mapped_type();
00155     }
00156 
00171     attribute_set_t merge(const attribute_set_t& other_set) const
00172     {
00173 
00174         attribute_set_t merged;
00175 
00176         adobe::set_union(*set_m, *other_set.set_m, std::back_inserter(merged.set_m.write()), less_key_only_t());
00177 
00178         return merged;
00179     }
00180 
00192     void insert(const value_type& attribute)
00193     {
00194         set_type::iterator result;
00195 
00196         if (lower_bound(attribute, result))
00197             result->second = attribute.second;
00198         else
00199             set_m.write().insert(result, attribute);
00200     }
00201 
00212     template <typename I> // I models InputIterator
00213     inline void insert(I first, I last)
00214     { for (; first != last; ++first) insert(*first); }
00215 
00224     inline void insert(const key_type& key, const mapped_type& value)
00225     { insert(value_type(key, value)); }
00226 
00237     std::size_t count_same(const attribute_set_t& other_set, bool mapped_matters = true) const;
00238 
00252     bool        has_collisions(const attribute_set_t& other_set) const;
00253 
00263     std::size_t count_collisions(const attribute_set_t& other_set) const;
00264 
00268     inline bool empty() const
00269         { return set_m->empty(); }
00270 
00275     inline size_type size() const
00276         { return set_m->size(); }
00277 
00282     const_iterator begin() const { return set_m->begin(); }
00283 
00288     const_iterator end() const { return set_m->end(); }
00289 
00296     void clear() { set_m.write().clear(); }
00297 
00298 private:
00299     friend bool operator == (const attribute_set_t& x, const attribute_set_t& y);
00300     friend std::ostream& operator << (std::ostream& s, const attribute_set_t& attribute_set);
00301 
00302     copy_on_write<set_type> set_m;
00303 };
00304 
00305 /*************************************************************************************************/
00306 
00319 inline bool operator == (const attribute_set_t& x, const attribute_set_t& y)
00320 {
00321     return x.set_m->size() == y.set_m->size() && x.count_same(y) == x.set_m->size();
00322 }
00323 
00324 /*************************************************************************************************/
00325 
00337 inline std::ostream& operator << (std::ostream& s, const attribute_set_t& attribute_set)
00338 {
00339     attribute_set_t::set_type::const_iterator    first(attribute_set.set_m->begin());
00340     attribute_set_t::set_type::const_iterator    last(attribute_set.set_m->end());
00341     bool                                                not_first(false);
00342 
00343     for (; first != last; ++first)
00344     {
00345         if (not_first)
00346             s << " ";
00347         else
00348             not_first = true;
00349 
00350         adobe::copy(first->first, std::ostream_iterator<char>(s));
00351 
00352         s << "='";
00353 
00354         adobe::copy(first->second, std::ostream_iterator<char>(s));
00355 
00356         s << "'";
00357     }
00358 
00359     return s;
00360 }
00361 
00362 /*************************************************************************************************/
00363 
00364 inline std::size_t attribute_set_t::count_same(const attribute_set_t& other_set, bool mapped_matters) const
00365 {
00366     std::size_t result(0);
00367 
00368     if (mapped_matters)
00369         result = adobe::set_intersection(   *set_m, *other_set.set_m,
00370                                             counting_output_iterator(),
00371                                             less_t())
00372                                     .count();
00373     else
00374         result = adobe::set_intersection(   *set_m, *other_set.set_m,
00375                                             counting_output_iterator(),
00376                                             less_key_only_t())
00377                                     .count();
00378 
00379     #if 0
00380         std::cerr   << "    count_same:\n"
00381                     << "          orig: " << *this << "\n"
00382                     << "          test: " << other_set << "\n"
00383                     << "        result: " << result << std::endl;
00384     #endif
00385 
00386     return result;
00387 }
00388 
00389 /*************************************************************************************************/
00390 
00391 inline bool attribute_set_t::has_collisions(const attribute_set_t& other_set) const
00392 {
00393     attribute_set_t::set_type::const_iterator    first(set_m->begin());
00394     attribute_set_t::set_type::const_iterator    last(set_m->end());
00395 
00396     for (; first != last; ++first)
00397     {
00398         set_type::const_iterator result;
00399 
00400         if (other_set.lower_bound(*first, result) && !token_range_equal(result->second, first->second))
00401             return true;
00402     }
00403 
00404     return false;
00405 }
00406 
00407 /*************************************************************************************************/
00408 
00409 inline std::size_t attribute_set_t::count_collisions(const attribute_set_t& other_set) const
00410 {
00411     attribute_set_t::set_type::const_iterator    first(set_m->begin());
00412     attribute_set_t::set_type::const_iterator    last(set_m->end());
00413     std::size_t                                         collision_count(0);
00414 
00415     for (; first != last; ++first)
00416     {
00417         set_type::const_iterator result;
00418 
00419         if (other_set.lower_bound(*first, result) && result->second != first->second)
00420             ++collision_count;
00421     }
00422 
00423     return collision_count;
00424 }
00425 
00426 /*************************************************************************************************/
00427 
00428 // REVISIT (sparent) : Extra typedef just for the doxygen tool.
00429 
// Function type of the client element callback. The parser invokes it with
// the range of the whole element, the element name, its attributes and its
// content; the callback hands back a token range (see xml_parser_t::is_element
// and xml_parser_t::content_callback for what is done with that range).
00430 typedef token_range_t (implementation_xml_element_proc_t)(
00431                         const token_range_t&     entire_element_range,
00432                         const token_range_t&     name,
00433                         const attribute_set_t&   attribute_set,
00434                         const token_range_t&     value);
00435 
// Type-erased holder for any callable with the signature above.
00436 typedef boost::function<implementation_xml_element_proc_t> xml_element_proc_t;
00437 
00438 /*************************************************************************************************/
00439 
00440 // NOTE (fbrereto) : Class declaration for the documentation is in xml_parser.dox
00441 template <typename O> // O models OutputIterator
00442 class xml_parser_t : public boost::noncopyable
00443 {
00444 public:
00445     typedef xml_element_proc_t                              callback_proc_t;
00446     typedef boost::function<bool (const token_range_t&)>    preorder_predicate_t;
00447     typedef xml_lex_t::token_type                           token_type;
00448 
00449     xml_parser_t(   uchar_ptr_t             first,
00450                     uchar_ptr_t             last,
00451                     const line_position_t&  position,
00452                     preorder_predicate_t    predicate,
00453                     callback_proc_t         callback,
00454                     O                       output) :
00455         pred_m(predicate),
00456         callback_m(callback),
00457         output_m(output),
00458         token_stream_m(first, last, position),
00459         preorder_mode_m(false)
00460     { }
00461 
00462     xml_parser_t(const xml_parser_t& rhs) :
00463         pred_m(rhs.pred_m),
00464         callback_m(rhs.callback_m),
00465         output_m(rhs.output_m),
00466         token_stream_m(rhs.token_stream_m),
00467         preorder_mode_m(rhs.preorder_mode_m)
00468     { }
00469 
00470     xml_parser_t& operator = (const xml_parser_t& rhs)
00471     {
00472         pred_m = rhs.pred_m;
00473         callback_m = rhs.callback_m;
00474         output_m = rhs.output_m;
00475         token_stream_m = rhs.token_stream_m;
00476         preorder_mode_m = rhs.preorder_mode_m;
00477 
00478         return *this;
00479     }
00480 
00481     virtual ~xml_parser_t()
00482     { }
00483 
00484     const line_position_t& next_position()
00485         { return token_stream_m.next_position(); }
00486 
00492     void set_preorder_predicate(preorder_predicate_t pred)
00493     { pred_m = pred; }
00494 
00518     void parse_element_sequence();
00519 
00565     void parse_content();
00566     
00570     void parse_document();
00571 
00572 /*
00573     REVISIT (sparent) : We should provide a protected call to get the token stream and allow
00574     subclasses to access it directly - but for now we'll stick with the law of Demiter.
00575 */
00576 
00577 protected:
00578     const token_type& get_token()
00579         { return token_stream_m.get(); }
00580     void putback()
00581         { token_stream_m.putback(); }
00582 
00583     bool is_token(xml_lex_token_set_t name, token_range_t& value);
00584     bool is_token(xml_lex_token_set_t name);
00585     void require_token(xml_lex_token_set_t name, token_range_t& value);
00586     void require_token(xml_lex_token_set_t name);
00587 
00588     /* REVISIT (sparent) : Should these be const? And is there a way to specify the class to throw? */
00589 
00590     void throw_exception(const char* error_string)
00591         { throw_parser_exception(error_string, next_position()); }
00592     void throw_exception(xml_lex_token_set_t found, xml_lex_token_set_t expected)
00593         { throw_parser_exception(token_to_string(found), token_to_string(expected), next_position()); }
00594 
00595     bool is_element(token_range_t& element);
00596     bool is_content(token_range_t& element);
00597     bool is_e_tag(token_range_t& name, token_range_t& close_tag);
00598     bool is_attribute_set(attribute_set_t& attribute_set);
00599     bool is_attribute(token_range_t& name, token_range_t& value);
00600     bool is_prolog();
00601     bool is_bom(token_range_t& bom);
00602     bool is_xml_decl(token_range_t& xml_decl);
00603 
00604     void    content_callback(   token_range_t&           result_element,
00605                                 const token_range_t&     old_element,
00606                                 const token_range_t&     start_tag,
00607                                 const attribute_set_t    attribute_set,
00608                                 const token_range_t&     content,
00609                                 bool                            preorder_parent);
00610 
00611     preorder_predicate_t    pred_m;
00612     callback_proc_t         callback_m;
00613     O                       output_m;
00614 
00615 private:
00616     xml_lex_t               token_stream_m;
00617     bool                    preorder_mode_m;
00618 };
00619 
00620 /*************************************************************************************************/
00621 
00622 inline token_range_t xml_element_echo(   const token_range_t&     entire_element_range,
00623                                                 const token_range_t&     /*name*/,
00624                                                 const attribute_set_t&   /*attribute_set*/,
00625                                                 const token_range_t&     /*value*/)
00626     { return entire_element_range; }
00627 
00628 /*************************************************************************************************/
00629 
00630 inline token_range_t xml_element_strip(  const token_range_t&     /*entire_element_range*/,
00631                                                 const token_range_t&     /*name*/,
00632                                                 const attribute_set_t&   /*attribute_set*/,
00633                                                 const token_range_t&     value)
00634     { return value; }
00635 
00636 /*************************************************************************************************/
00637 
00638 inline token_range_t xml_element_linefeed(   const token_range_t&     /*entire_element_range*/,
00639                                                     const token_range_t&     name,
00640                                                     const attribute_set_t&   attribute_set,
00641                                                     const token_range_t&     value)
00642 {
00643     if (token_range_equal(name, static_token_range("br")) &&
00644         attribute_set.empty() &&
00645         boost::size(value) == 0)
00646     {
00647 #if ADOBE_PLATFORM_WIN
00648         return static_token_range("&cr;&lf;");
00649 #elif ADOBE_PLATFORM_MAC
00650         return static_token_range("&cr;");
00651 #elif   ADOBE_PLATFORM_UNIX || ADOBE_PLATFORM_LINUX || ADOBE_PLATFORM_BSD || ADOBE_PLATFORM_SOLARIS ||\
00652         ADOBE_PLATFORM_IRIX || ADOBE_PLATFORM_HPUX || ADOBE_PLATFORM_CYGWIN || ADOBE_PLATFORM_AIX
00653         return static_token_range("&lf;");
00654 #else
00655     #error "Line ending for platform unknown - please configure and report the results to stlab.adobe.com"
00656 #endif
00657     }
00658 
00659     return value;
00660 }
00661 
00662 /*************************************************************************************************/
00663 
// Implementation details; not part of the public parser interface.
00664 namespace implementation {
00665 
00666 /*************************************************************************************************/
00667 
// Maps a reference token to the token range that is written to the output in
// its place. xml_parser_t::is_content calls this for every non-empty
// reference token it meets outside of preorder mode. Declared here only;
// the definition is not part of this header.
00668 token_range_t transform_reference(const token_range_t& reference);
00669 
00670 /*************************************************************************************************/
00671 
00672 } // namespace implementation
00673 
00674 /*************************************************************************************************/
00675 
00676 template <typename O> // O models OutputIterator
00677 bool xml_parser_t<O>::is_token(xml_lex_token_set_t token_name, token_range_t& token_range)
00678 {
00679     const token_type& result(get_token());
00680 
00681     if (result.enum_m == token_name)
00682     {
00683         token_range = result.range_m;
00684 
00685         return true;
00686     }
00687 
00688     putback();
00689 
00690     return false;
00691 }
00692 
00693 /*************************************************************************************************/
00694 
00695 template <typename O> // O models OutputIterator
00696 bool xml_parser_t<O>::is_token(xml_lex_token_set_t token_name)
00697 {
00698     const token_type& result(get_token());
00699 
00700     if (result.enum_m == token_name)
00701         return true;
00702 
00703     putback();
00704 
00705     return false;
00706 }
00707 
00708 /*************************************************************************************************/
00709 
00710 template <typename O> // O models OutputIterator
00711 void xml_parser_t<O>::require_token(xml_lex_token_set_t token_name, token_range_t& token_range)
00712 {
00713     const token_type& result(get_token());
00714 
00715     if (result.enum_m != token_name)
00716         throw_exception(result.enum_m, token_name);
00717 
00718     token_range = result.range_m;
00719 }
00720 
00721 /*************************************************************************************************/
00722 
00723 template <typename O> // O models OutputIterator
00724 void xml_parser_t<O>::require_token(xml_lex_token_set_t token_name)
00725 {
00726     const token_type& result(get_token());
00727 
00728     if (result.enum_m != token_name)
00729         throw_exception(result.enum_m, token_name);
00730 }
00731 
00732 /*************************************************************************************************/
00733 
00734 template <typename O> // O models OutputIterator
00735 void xml_parser_t<O>::content_callback( token_range_t&           result_element,
00736                                         const token_range_t&     old_element,
00737                                         const token_range_t&     start_tag,
00738                                         const attribute_set_t    attribute_set,
00739                                         const token_range_t&     content,
00740                                         bool                            preorder_parent)
00741 {
00742     if (preorder_parent)
00743     {
00744         // if we are in preorder mode and we are the preorder_parent,
00745         // we send the content to the client callback function.
00746         // We get back a single token_range, which we then parse all
00747         // over again in a content parser all its own.
00748 
00749         token_range_t new_content(callback_m(old_element, start_tag, attribute_set, content));
00750 
00751         if (old_element == new_content)
00752         {
00753             // In the case when the new content is the same as the old element,
00754             // the user has opted to echo the element to the output unchanged.
00755 
00756             adobe::copy(old_element, output_m);
00757         }
00758         else
00759         {
00760             // otherwise we need to parse the new content before we can move on to
00761             // the rest of the parse. The new parser has the same predicate and
00762             // output iterator as this one
00763 
00764             xml_parser_t<O>( new_content.first, new_content.second,
00765                                     next_position(), pred_m, callback_m, output_m).parse_content();
00766         }
00767 
00768         // once the token_range from the client has been parsed, we can turn off
00769         // preorder mode and resume parsing the original token stream from where we
00770         // left off.
00771 
00772         preorder_mode_m = false; // only the preorder_parent can turn off preorder mode
00773     }
00774     else
00775     {
00776         // in the case we are in preorder mode but we are not the initiator of
00777         // the mode, we are within the context of another preorder parse. In
00778         // this case we use the entire contents of the element as the token range
00779         // and hand it back as the return value of this function.
00780 
00781         result_element = old_element;
00782     }
00783 }
00784 
00785 /*************************************************************************************************/
00786 
00787 template <typename O> // O models OutputIterator
00788 bool xml_parser_t<O>::is_element(token_range_t& element)
00789 {
00790     element = token_range_t();
00791 
00792     attribute_set_t attribute_set;
00793 
00794     token_range_t   open_tag;
00795     token_range_t   close_tag;
00796 
00797     if (!is_token(xml_token_open_tag_k, open_tag)) return false;
00798 
00799     token_range_t   start_tag;
00800     token_range_t   end_tag;
00801 
00802     require_token(xml_token_name_k, start_tag);
00803 
00804     bool preorder_parent(false); // explained below
00805 
00806     // Preorder mode is a state for the entire parser. In this state the
00807     // client processing callback is never called until the end of the
00808     // current element is found. This precludes the processing of elements
00809     // and other entities nested within this element from being handled until
00810     // this containing element is processed. This is useful in the case when 
00811     // the content of the element could potentially be replaced, in which
00812     // case processing the nested elements first would be a moot point.
00813 
00814     if (!preorder_mode_m && pred_m)
00815     {
00816         // preorder mode is only set when the predicate is defined and
00817         // returns true for the start_tag of this element.
00818 
00819         preorder_mode_m = pred_m(start_tag);
00820 
00821 
00822         // preorder_parent is used to denote which frame in the stack began
00823         // the preorder traversal, as it is this frame alone that can turn
00824         // it back off again.
00825 
00826         preorder_parent = preorder_mode_m;
00827     }
00828 
00829     is_attribute_set(attribute_set);
00830 
00831     if (is_token(xml_token_slash_close_tag_k, close_tag))
00832     {
00833         if (preorder_mode_m)
00834         {
00835             content_callback(   element,
00836                                 token_range_t(open_tag.first, close_tag.second),
00837                                 start_tag,
00838                                 attribute_set,
00839                                 token_range_t(),
00840                                 preorder_parent);
00841         }
00842         else
00843         {
00844             // in the case when we are not in preorder mode at all, we pass the element
00845             // to the client callback and output the token_range we receive back.
00846 
00847             token_range_t result(callback_m( token_range_t(open_tag.first, close_tag.second),
00848                                                     start_tag,
00849                                                     attribute_set,
00850                                                     token_range_t()));
00851 
00852             adobe::copy(result, output_m);
00853         }
00854 
00855         return true;
00856     }
00857 
00858     token_range_t close_of_open_tag;
00859 
00860     require_token(xml_token_close_tag_k, close_of_open_tag);
00861 
00862     token_range_t content;
00863 
00864     // In the case of inorder parsing we want to output the tags
00865     // as we see them; in this case we need to output the opening
00866     // tag before we can go on to the content parsing.
00867 
00868     if (!preorder_mode_m)
00869         std::copy(open_tag.first, close_of_open_tag.second, output_m);
00870 
00871     if (!is_content(content))
00872         throw std::runtime_error("Content expected but not found.");
00873     
00874     if (!is_e_tag(end_tag, close_tag))
00875         throw std::runtime_error("End tag expected but not found.");
00876 
00877     if (!token_range_equal(start_tag, end_tag))
00878         throw std::runtime_error("Start tag and end tag do not have the same name.");
00879 
00880     if (!preorder_mode_m)
00881     {
00882         // in the case when we are not in preorder mode
00883         // we output the content we have immediately,
00884         // then we need to output the closing tag before
00885         // we can go on to the rest of the parse.
00886 
00887         adobe::copy(content, output_m);
00888         adobe::copy(token_range_t(end_tag.first - 2, end_tag.second + 1), output_m);
00889     }
00890     else
00891     {
00892         // In this instance we are continuing a preorder parse...
00893 
00894         content_callback(   element,
00895                             token_range_t(open_tag.first, close_tag.second),
00896                             start_tag,
00897                             attribute_set,
00898                             content,
00899                             preorder_parent);
00900     }
00901 
00902     return true;
00903 }
00904 
00905 /*************************************************************************************************/
00906 
00907 template <typename O> // O models OutputIterator
00908 bool xml_parser_t<O>::is_content(token_range_t& content)
00909 {
00910     content = token_range_t();
00911 
00912     token_range_t char_data;
00913 
00914     // NOTE (fbrereto) :    The content parser can never initiate a preorder mode.
00915     //                      It can only be initiated by the parsing of a preorder
00916     //                      element, which isn't handled here. So for the content
00917     //                      parse we are either in preorder mode or not; we need
00918     //                      not worry about managing it.
00919 
00920     if (is_token(xml_token_char_data_k, char_data))
00921     {
00922         // in the case when we are in preorder mode, we are part of a nested
00923         // content, and we want to use this beginning char_data token as the
00924         // start of the overall content token_range.
00925 
00926         if (preorder_mode_m)
00927             { content = char_data; }
00928 
00929         // in the case when we are not in preorder mode this range of char_data
00930         // needs to be sent directly to the output.
00931 
00932         else
00933             { adobe::copy(char_data, output_m); }
00934     }
00935 
00936     while (true)
00937     {
00938         token_range_t result;
00939 
00940         if (is_token(xml_token_reference_k, result))
00941         {
00942             if (boost::size(result))
00943             {
00944                 if (preorder_mode_m)
00945                 {
00946                     // Again, if we're in preorder mode we're not outputting
00947                     // but extending (possibly even starting, too) the token_range
00948                     // for the preorder element.
00949 
00950                     if (!content.first) content.first = result.first;
00951 
00952                     content.second = result.second;
00953                 }
00954                 else
00955                 {
00956                     // if we're not in preorder mode, we pass the element's
00957                     // reference-transformed token_range result directly to
00958                     // the output.
00959 
00960                     adobe::copy(implementation::transform_reference(result), output_m);
00961                 }
00962             }
00963         }
00964         else if (is_element(result))
00965         {
00966             if (boost::size(result))
00967             {
00968                 if (preorder_mode_m)
00969                 {
00970                     // Again, if we're in preorder mode we're not outputting
00971                     // but extending (possibly even starting, too) the token_range
00972                     // for the preorder element.
00973 
00974                     if (!content.first) content.first = result.first;
00975 
00976                     content.second = result.second;
00977                 }
00978                 else
00979                 {
00980                     // if we're not in preorder mode, we pass the element's
00981                     // token_range result directly to the output.
00982 
00983                     adobe::copy(result, output_m);
00984                 }
00985             }
00986         }
00987         else if (is_token(xml_token_comment_k, result))
00988         {
00989             // Comments are not parsed by any client functions.
00990             // They are merely ignored by the parser.
00991 
00992             // REVISIT eberdahl - Because some clients may want to
00993             // handle comments, we may want to extend the client
00994             // callback system to permit a comment callback.
00995         }
00996         else
00997             { break; }
00998 
00999         if (is_token(xml_token_char_data_k, char_data))
01000         {
01001             // if we find more char_data at the end of the content, we
01002             // either extent the preorder content data or we output
01003             // the contents of the char_data directly to the output (in
01004             // fullorder mode).
01005 
01006             if (preorder_mode_m)
01007                 { content.second = char_data.second; }
01008             else
01009                 { adobe::copy(char_data, output_m); }
01010         }
01011     }
01012 
01013     return true;
01014 }
01015 
01016 /*************************************************************************************************/
01017 
01018 template <typename O> // O models OutputIterator
01019 bool xml_parser_t<O>::is_e_tag(token_range_t& name, token_range_t& close_tag)
01020 {
01021     if (!is_token(xml_token_open_slash_tag_k)) return false;
01022 
01023     require_token(xml_token_name_k, name);
01024 
01025     require_token(xml_token_close_tag_k, close_tag);
01026 
01027     return true;
01028 }
01029 
01030 /*************************************************************************************************/
01031 
01032 template <typename O> // O models OutputIterator
01033 bool xml_parser_t<O>::is_attribute_set(attribute_set_t& attribute_set)
01034 {
01035     token_range_t att_name;
01036     token_range_t att_value;
01037 
01038     while (is_attribute(att_name, att_value))
01039         attribute_set.insert(att_name, att_value);
01040 
01041     return true;
01042 }
01043 
01044 /*************************************************************************************************/
01045 
01046 template <typename O> // O models OutputIterator
01047 bool xml_parser_t<O>::is_prolog()
01048 {
01049     token_range_t bom;
01050     token_range_t xml_decl;
01051 
01052     if (is_bom(bom))
01053     {
01054         // REVISIT eberdahl 2006 Jun 18 - sanity check the bom
01055     }
01056     
01057     if (is_xml_decl(xml_decl))
01058     {
01059         // REVISIT eberdahl 2006 Jun 18 - sanity check the encoding
01060         // of the XMLDecl
01061         
01062         return true;
01063     }
01064     
01065     return false;
01066 }
01067 
01068 /*************************************************************************************************/
01069 
01070 template <typename O> // O models OutputIterator
01071 bool xml_parser_t<O>::is_bom(token_range_t& bom)
01072 {
01073     const token_range_t utf8_bom = static_token_range("\xEF\xBB\xBF");
01074     const token_range_t utf16_be_bom = static_token_range("\xFE\xFF");
01075     const token_range_t utf16_le_bom = static_token_range("\xFF\xFE");
01076 
01077     bool result = false;
01078     
01079     // whitespace skipping should be off when sniffing for a bom
01080     token_stream_m.set_skip_white_space(false);
01081 
01082     if (is_token(xml_token_char_data_k, bom))
01083     {
01084         if (boost::size(utf8_bom) <= boost::size(bom) &&
01085             adobe::equal(utf8_bom, bom.first))
01086         {
01087             bom.second = bom.first;
01088             std::advance(bom.second, boost::size(utf8_bom));
01089 
01090             result = true;
01091         }
01092         else if (boost::size(utf16_be_bom) <= boost::size(bom) &&
01093                  adobe::equal(utf16_be_bom, bom.first))
01094         {
01095             // it's a bom, but it's not a format the parser supports
01096             throw_exception("utf16be bom encountered; xml_parser_t only supports utf8 encoding");           
01097         }
01098         else if (boost::size(utf16_le_bom) <= boost::size(bom) &&
01099                  adobe::equal(utf16_le_bom, bom.first))
01100         {
01101             // it's a bom, but it's not a format the parser supports
01102             throw_exception("utf16le bom encountered; xml_parser_t only supports utf8 encoding");           
01103         }
01104     }
01105 
01106     token_stream_m.set_skip_white_space(true);
01107 
01108     return result;
01109 }
01110 
01111 /*************************************************************************************************/
01112 
01113 template <typename O> // O models OutputIterator
01114 bool xml_parser_t<O>::is_xml_decl(token_range_t& xml_decl)
01115 {
01116     if (is_token(xml_token_processing_instruction_k, xml_decl))
01117     {
01118         // REVISIT eberdahl 2006 Jun 18 - sanity check that the PI
01119         // encountered is, in fact, targeted at the xml application
01120         
01121         return true;
01122     }
01123     
01124     return false;
01125 }
01126 
01127 /*************************************************************************************************/
01128 
01129 template <typename O> // O models OutputIterator
01130 bool xml_parser_t<O>::is_attribute(token_range_t& name, token_range_t& value)
01131 {
01132     if (is_token(xml_token_name_k, name))
01133     {
01134         require_token(xml_token_equals_k);
01135 
01136         require_token(xml_token_att_value_k, value);
01137 
01138         return true;
01139     }
01140 
01141     return false;
01142 }
01143 
01144 /*************************************************************************************************/
01145 
01146 template <typename O> // O models OutputIterator
01147 void xml_parser_t<O>::parse_element_sequence()
01148 {
01149     assert(callback_m);
01150     
01151     token_range_t dummy;
01152 
01153     token_stream_m.set_skip_white_space(false);
01154 
01155     while (is_element(dummy))
01156         is_token(xml_token_char_data_k);
01157 }
01158 
01159 /*************************************************************************************************/
01160 
01161 template <typename O> // O models OutputIterator
01162 void xml_parser_t<O>::parse_content()
01163 {
01164     token_range_t content;
01165 
01166     token_stream_m.set_skip_white_space(false);
01167 
01168     while (true)
01169     {
01170         // always returns true; have to test results
01171         is_content(content);
01172 
01173         if (boost::size(content))
01174         {
01175             token_range_t result(this->callback_m(   content,
01176                                                             token_range_t(),
01177                                                             attribute_set_t(),
01178                                                             content));
01179 
01180             adobe::copy(result, this->output_m);
01181         }
01182         else
01183             break;
01184     }
01185 }
01186 
01187 /*************************************************************************************************/
01188 
01189 template <typename O> // O models OutputIterator
01190 void xml_parser_t<O>::parse_document()
01191 {
01192     token_range_t dummy;
01193     
01194     token_stream_m.set_skip_white_space(true);
01195     
01196     is_prolog();
01197     is_element(dummy);
01198 }
01199 
01200 /*************************************************************************************************/
01201 
01221 template <typename O> // O models OutputIterator
01222 inline xml_parser_t<O> make_xml_parser( uchar_ptr_t                                     first,
01223                                         uchar_ptr_t                                     last,
01224                                         const line_position_t&                          position,
01225                                         typename xml_parser_t<O>::preorder_predicate_t  predicate,
01226                                         typename xml_parser_t<O>::callback_proc_t       callback,
01227                                         O                                               output)
01228 { return xml_parser_t<O>(first, last, position, predicate, callback, output); }
01229 
01230 /*************************************************************************************************/
// Converts a leading run of hexadecimal digits in [first, last) to a number.
//
// result is reset to 0, then for every leading character in 0-9, a-f or
// A-F it is shifted left by four bits and the digit's value is added.
// Conversion stops at the first character that is not a hexadecimal digit,
// or at last.
//
// Returns an iterator to the first character that was not consumed. If no
// digit was consumed, result is 0 and the returned iterator equals first.
// Overflow of Result is not detected.
//
// Digits are classified with plain range comparisons instead of the
// <cctype> functions. Those functions have undefined behavior for values
// not representable as unsigned char (e.g. negative char values), and the
// comparisons also remove any dependency on the global locale.
template <typename Result, typename InputIterator>
InputIterator xatoi(InputIterator first, InputIterator last, Result& result)
{
    typedef typename std::iterator_traits<InputIterator>::value_type char_type;

    result = 0;

    for (; first != last; ++first)
    {
        const char_type c(*first);
        int             digit(0);

        if ('0' <= c && c <= '9')
            { digit = c - '0'; }
        else if ('a' <= c && c <= 'f')
            { digit = c - 'a' + 10; }
        else if ('A' <= c && c <= 'F')
            { digit = c - 'A' + 10; }
        else
            { break; } // not a hexadecimal digit; leave it unconsumed

        result <<= 4;
        result += digit;
    }

    return first;
}
01263 
01264 /*************************************************************************************************/
// Converts a leading run of decimal digits in [first, last) to a number.
//
// result is reset to 0, then for every leading character in 0-9 it is
// multiplied by ten and the digit's value is added. Conversion stops at the
// first non-digit character, or at last. No sign or whitespace is accepted.
//
// Returns an iterator to the first character that was not consumed. If no
// digit was consumed, result is 0 and the returned iterator equals first.
// Overflow of Result is not detected.
//
// Digits are classified with a plain range comparison instead of
// std::isdigit, which has undefined behavior for values not representable
// as unsigned char (e.g. negative char values).
template <typename Result, typename InputIterator>
InputIterator datoi(InputIterator first, InputIterator last, Result& result)
{
    result = 0;

    for (; first != last; ++first)
    {
        if (*first < '0' || '9' < *first)
            { break; } // not a decimal digit; leave it unconsumed

        result *= 10;
        result += *first - '0';
    }

    return first;
}
01286 
01287 /*************************************************************************************************/
01288 
01289 } // namespace adobe
01290 
01291 /*************************************************************************************************/
01292 
01293 #endif
01294 
01295 /*************************************************************************************************/

Copyright © 2006-2007 Adobe Systems Incorporated.

Use of this website signifies your agreement to the Terms of Use and Online Privacy Policy.

Search powered by Google