Adobe Software Technology Lab: xml_parser.hpp Source File

Documentation

Overview
Building ASL
Documentation
Library Wiki Docs
Indices
Browse Perforce
More Info

Release Notes
Wiki
Site Search
License
Success Stories
Contributors
Media

Download
Perforce Depots
Other Resources

Boost
RIAForge
SGI STL
Go to the documentation of this file.
00001 /*
00002     Copyright 2005-2007 Adobe Systems Incorporated
00003     Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
00004     or a copy at http://stlab.adobe.com/licenses.html)
00005 */
00006 
00007 /*************************************************************************************************/
00008 
00009 #ifndef ADOBE_XML_PARSER_HPP
00010 #define ADOBE_XML_PARSER_HPP
00011 
00012 /*************************************************************************************************/
00013 
00014 #include <adobe/config.hpp>
00015 
00016 #include <adobe/any_regular.hpp>
00017 #include <adobe/algorithm/set.hpp>
00018 #include <adobe/istream.hpp>
00019 #include <adobe/array.hpp>
00020 #include <adobe/copy_on_write.hpp>
00021 #include <adobe/name.hpp>
00022 #include <adobe/dictionary.hpp>
00023 #include <adobe/string.hpp>
00024 #include <adobe/implementation/xml_lex.hpp>
00025 #include <adobe/implementation/xml_token.hpp>
00026 #include <adobe/implementation/parser_shared.hpp>
00027 
00028 #include <boost/function.hpp>
00029 #include <boost/noncopyable.hpp>
00030 #include <boost/operators.hpp>
00031 #include <boost/bind.hpp>
00032 #include <boost/array.hpp>
00033 #include <boost/iterator/iterator_facade.hpp>
00034 
00035 #include <utility>
00036 #include <istream>
00037 #include <sstream>
00038 #include <iomanip>
00039 #include <cassert>
00040 #include <list>
00041 
00042 /*************************************************************************************************/
00043 
00044 namespace adobe {
00045 
00046 /*************************************************************************************************/
00047 
00048 // NOTE (fbrereto) : Class declaration for the documentation is in xml_parser.dox
00049 struct attribute_set_t : public boost::equality_comparable<attribute_set_t>
00050 {
00051     typedef token_range_t                key_type;
00052     typedef token_range_t                mapped_type;
00053     typedef std::pair<key_type, mapped_type>    value_type;
00054     typedef std::vector<value_type>             set_type;
00055     typedef set_type::size_type                 size_type;
00056     typedef set_type::const_iterator            const_iterator;
00057     typedef const_iterator                      iterator;
00058 
00065     struct less_t : std::binary_function<value_type, value_type, bool>
00066     {
00067         bool operator () (const value_type& x, const value_type& y) const
00068         {
00069             return  token_range_less(x.first, y.first) ||
00070                     (!token_range_less(y.first, x.first) &&
00071                     token_range_less(x.second, y.second));
00072         }
00073     };
00074 
00080     struct less_key_only_t : std::binary_function<value_type, value_type, bool>
00081     {
00082         bool operator () (const value_type& x, const value_type& y) const
00083         {
00084             return token_range_less(x.first, y.first);
00085         }
00086     };
00087 
00099     bool lower_bound(const value_type& attribute, set_type::iterator& result)
00100     {
00101         result = adobe::lower_bound(set_m.write(), attribute, less_key_only_t());
00102 
00103         return  result != set_m.write().end() &&
00104                 token_range_equal(result->first, attribute.first);
00105     }
00106 
00118     bool lower_bound(const key_type& key, set_type::iterator& result)
00119     { return lower_bound(value_type(key, mapped_type()), result); }
00120 
00124     bool lower_bound(const value_type& attribute, set_type::const_iterator& result) const
00125     {
00126         result = adobe::lower_bound(*set_m, attribute, less_key_only_t());
00127 
00128         return  result != set_m->end() &&
00129                 token_range_equal(result->first, attribute.first);
00130     }
00131 
00135     bool lower_bound(const key_type& key, set_type::const_iterator& result) const
00136     { return lower_bound(value_type(key, mapped_type()), result); }
00137 
00147     mapped_type operator [] (const key_type& key) const
00148     {
00149         set_type::const_iterator result;
00150 
00151         if (lower_bound(key, result))
00152             return result->second;
00153 
00154         return mapped_type();
00155     }
00156 
00171     attribute_set_t merge(const attribute_set_t& other_set) const
00172     {
00173 
00174         attribute_set_t merged;
00175 
00176         adobe::set_union(*set_m, *other_set.set_m, std::back_inserter(merged.set_m.write()), less_key_only_t());
00177 
00178         return merged;
00179     }
00180 
00192     void insert(const value_type& attribute)
00193     {
00194         set_type::iterator result;
00195 
00196         if (lower_bound(attribute, result))
00197             result->second = attribute.second;
00198         else
00199             set_m.write().insert(result, attribute);
00200     }
00201 
00212     template <typename I> // I models InputIterator
00213     inline void insert(I first, I last)
00214     { for (; first != last; ++first) insert(*first); }
00215 
00224     inline void insert(const key_type& key, const mapped_type& value)
00225     { insert(value_type(key, value)); }
00226 
00237     std::size_t count_same(const attribute_set_t& other_set, bool mapped_matters = true) const;
00238 
00252     bool        has_collisions(const attribute_set_t& other_set) const;
00253 
00263     std::size_t count_collisions(const attribute_set_t& other_set) const;
00264 
00268     inline bool empty() const
00269         { return set_m->empty(); }
00270 
00275     inline size_type size() const
00276         { return set_m->size(); }
00277 
00282     const_iterator begin() const { return set_m->begin(); }
00283 
00288     const_iterator end() const { return set_m->end(); }
00289 
00296     void clear() { set_m.write().clear(); }
00297 
00298 private:
00299     friend bool operator == (const attribute_set_t& x, const attribute_set_t& y);
00300     friend std::ostream& operator << (std::ostream& s, const attribute_set_t& attribute_set);
00301 
00302     copy_on_write<set_type> set_m;
00303 };
00304 
00305 /*************************************************************************************************/
00306 
00319 inline bool operator == (const attribute_set_t& x, const attribute_set_t& y)
00320 {
00321     return x.set_m->size() == y.set_m->size() && x.count_same(y) == x.set_m->size();
00322 }
00323 
00324 /*************************************************************************************************/
00325 
00337 inline std::ostream& operator << (std::ostream& s, const attribute_set_t& attribute_set)
00338 {
00339     attribute_set_t::set_type::const_iterator    first(attribute_set.set_m->begin());
00340     attribute_set_t::set_type::const_iterator    last(attribute_set.set_m->end());
00341     bool                                                not_first(false);
00342 
00343     for (; first != last; ++first)
00344     {
00345         if (not_first)
00346             s << " ";
00347         else
00348             not_first = true;
00349 
00350         adobe::copy(first->first, std::ostream_iterator<char>(s));
00351 
00352         s << "='";
00353 
00354         adobe::copy(first->second, std::ostream_iterator<char>(s));
00355 
00356         s << "'";
00357     }
00358 
00359     return s;
00360 }
00361 
00362 /*************************************************************************************************/
00363 
00364 inline std::size_t attribute_set_t::count_same(const attribute_set_t& other_set, bool mapped_matters) const
00365 {
00366     std::size_t result(0);
00367 
00368     if (mapped_matters)
00369         result = adobe::set_intersection(   *set_m, *other_set.set_m,
00370                                             counting_output_iterator(),
00371                                             less_t())
00372                                     .count();
00373     else
00374         result = adobe::set_intersection(   *set_m, *other_set.set_m,
00375                                             counting_output_iterator(),
00376                                             less_key_only_t())
00377                                     .count();
00378 
00379     #if 0
00380         std::cerr   << "    count_same:\n"
00381                     << "          orig: " << *this << "\n"
00382                     << "          test: " << other_set << "\n"
00383                     << "        result: " << result << std::endl;
00384     #endif
00385 
00386     return result;
00387 }
00388 
00389 /*************************************************************************************************/
00390 
00391 inline bool attribute_set_t::has_collisions(const attribute_set_t& other_set) const
00392 {
00393     attribute_set_t::set_type::const_iterator    first(set_m->begin());
00394     attribute_set_t::set_type::const_iterator    last(set_m->end());
00395 
00396     for (; first != last; ++first)
00397     {
00398         set_type::const_iterator result;
00399 
00400         if (other_set.lower_bound(*first, result) && !token_range_equal(result->second, first->second))
00401             return true;
00402     }
00403 
00404     return false;
00405 }
00406 
00407 /*************************************************************************************************/
00408 
00409 inline std::size_t attribute_set_t::count_collisions(const attribute_set_t& other_set) const
00410 {
00411     attribute_set_t::set_type::const_iterator    first(set_m->begin());
00412     attribute_set_t::set_type::const_iterator    last(set_m->end());
00413     std::size_t                                         collision_count(0);
00414 
00415     for (; first != last; ++first)
00416     {
00417         set_type::const_iterator result;
00418 
00419         if (other_set.lower_bound(*first, result) && result->second != first->second)
00420             ++collision_count;
00421     }
00422 
00423     return collision_count;
00424 }
00425 
00426 /*************************************************************************************************/
00427 
00428 // REVISIT (sparent) : Extra typedef just for the doxygen tool.
00429 
00430 typedef token_range_t (implementation_xml_element_proc_t)(
00431                         const token_range_t&     entire_element_range,
00432                         const token_range_t&     name,
00433                         const attribute_set_t&   attribute_set,
00434                         const token_range_t&     value);
00435 
00436 typedef boost::function<implementation_xml_element_proc_t> xml_element_proc_t;
00437 
00438 /*************************************************************************************************/
00439 
00440 // NOTE (fbrereto) : Class declaration for the documentation is in xml_parser.dox
00441 template <typename O> // O models OutputIterator
00442 class xml_parser_t : public boost::noncopyable
00443 {
00444 public:
00445     typedef xml_element_proc_t                              callback_proc_t;
00446     typedef boost::function<bool (const token_range_t&)>    preorder_predicate_t;
00447     typedef xml_lex_t::token_type                           token_type;
00448 
00449     xml_parser_t(   uchar_ptr_t             first,
00450                     uchar_ptr_t             last,
00451                     const line_position_t&  position,
00452                     preorder_predicate_t    predicate,
00453                     callback_proc_t         callback,
00454                     O                       output) :
00455         pred_m(predicate),
00456         callback_m(callback),
00457         output_m(output),
00458         token_stream_m(first, last, position),
00459         preorder_mode_m(false)
00460     { }
00461 
00462     xml_parser_t(const xml_parser_t& rhs) :
00463         pred_m(rhs.pred_m),
00464         callback_m(rhs.callback_m),
00465         output_m(rhs.output_m),
00466         token_stream_m(rhs.token_stream_m),
00467         preorder_mode_m(rhs.preorder_mode_m)
00468     { }
00469 
00470     xml_parser_t& operator = (const xml_parser_t& rhs)
00471     {
00472         pred_m = rhs.pred_m;
00473         callback_m = rhs.callback_m;
00474         output_m = rhs.output_m;
00475         token_stream_m = rhs.token_stream_m;
00476         preorder_mode_m = rhs.preorder_mode_m;
00477 
00478         return *this;
00479     }
00480 
00481     virtual ~xml_parser_t()
00482     { }
00483 
00484     const line_position_t& next_position()
00485         { return token_stream_m.next_position(); }
00486 
00492     void set_preorder_predicate(preorder_predicate_t pred)
00493     { pred_m = pred; }
00494 
00518     void parse_element_sequence();
00519 
00565     void parse_content();
00566     
00570     void parse_document();
00571 
00572 /*
00573     REVISIT (sparent) : We should provide a protected call to get the token stream and allow
00574     subclasses to access it directly - but for now we'll stick with the law of Demiter.
00575 */
00576 
00577 protected:
00578     const token_type& get_token()
00579         { return token_stream_m.get(); }
00580     void putback()
00581         { token_stream_m.putback(); }
00582 
00583     bool is_token(xml_lex_token_set_t name, token_range_t& value);
00584     bool is_token(xml_lex_token_set_t name);
00585     void require_token(xml_lex_token_set_t name, token_range_t& value);
00586     void require_token(xml_lex_token_set_t name);
00587 
00588     /* REVISIT (sparent) : Should these be const? And is there a way to specify the class to throw? */
00589 
00590     void throw_exception(const char* error_string)
00591         { throw_parser_exception(error_string, next_position()); }
00592     void throw_exception(xml_lex_token_set_t found, xml_lex_token_set_t expected)
00593         { throw_parser_exception(token_to_string(found), token_to_string(expected), next_position()); }
00594 
00595     bool is_element(token_range_t& element);
00596     bool is_content(token_range_t& element);
00597     bool is_e_tag(token_range_t& name, token_range_t& close_tag);
00598     bool is_attribute_set(attribute_set_t& attribute_set);
00599     bool is_attribute(token_range_t& name, token_range_t& value);
00600     bool is_prolog();
00601     bool is_bom(token_range_t& bom);
00602     bool is_xml_decl(token_range_t& xml_decl);
00603 
00604     void    content_callback(   token_range_t&           result_element,
00605                                 const token_range_t&     old_element,
00606                                 const token_range_t&     start_tag,
00607                                 const attribute_set_t    attribute_set,
00608                                 const token_range_t&     content,
00609                                 bool                            preorder_parent);
00610 
00611     preorder_predicate_t    pred_m;
00612     callback_proc_t         callback_m;
00613     O                       output_m;
00614 
00615 private:
00616     xml_lex_t               token_stream_m;
00617     bool                    preorder_mode_m;
00618 };
00619 
00620 /*************************************************************************************************/
00621 
00622 inline token_range_t xml_element_echo(   const token_range_t&     entire_element_range,
00623                                                 const token_range_t&     /*name*/,
00624                                                 const attribute_set_t&   /*attribute_set*/,
00625                                                 const token_range_t&     /*value*/)
00626     { return entire_element_range; }
00627 
00628 /*************************************************************************************************/
00629 
00630 inline token_range_t xml_element_strip(  const token_range_t&     /*entire_element_range*/,
00631                                                 const token_range_t&     /*name*/,
00632                                                 const attribute_set_t&   /*attribute_set*/,
00633                                                 const token_range_t&     value)
00634     { return value; }
00635 
00636 /*************************************************************************************************/
00637 
00638 inline token_range_t xml_element_linefeed(   const token_range_t&     /*entire_element_range*/,
00639                                                     const token_range_t&     name,
00640                                                     const attribute_set_t&   attribute_set,
00641                                                     const token_range_t&     value)
00642 {
00643     if (token_range_equal(name, static_token_range("br")) &&
00644         attribute_set.empty() &&
00645         boost::size(value) == 0)
00646     {
00647 #if ADOBE_PLATFORM_WIN
00648         return static_token_range("&cr;&lf;");
00649 #elif ADOBE_PLATFORM_MAC
00650         return static_token_range("&cr;");
00651 #elif   ADOBE_PLATFORM_UNIX || ADOBE_PLATFORM_LINUX || ADOBE_PLATFORM_BSD || ADOBE_PLATFORM_SOLARIS ||\
00652         ADOBE_PLATFORM_IRIX || ADOBE_PLATFORM_HPUX || ADOBE_PLATFORM_CYGWIN || ADOBE_PLATFORM_AIX
00653         return static_token_range("&lf;");
00654 #else
00655     #error "Line ending for platform unknown - please configure and report the results to stlab.adobe.com"
00656 #endif
00657     }
00658 
00659     return value;
00660 }
00661 
00662 /*************************************************************************************************/
00663 
00664 namespace implementation {
00665 
00666 /*************************************************************************************************/
00667 
00668 token_range_t transform_reference(const token_range_t& reference);
00669 
00670 /*************************************************************************************************/
00671 
00672 } // namespace implementation
00673 
00674 /*************************************************************************************************/
00675 
00676 template <typename O> // O models OutputIterator
00677 bool xml_parser_t<O>::is_token(xml_lex_token_set_t token_name, token_range_t& token_range)
00678 {
00679     const token_type& result(get_token());
00680 
00681     if (result.enum_m == token_name)
00682     {
00683         token_range = result.range_m;
00684 
00685         return true;
00686     }
00687 
00688     putback();
00689 
00690     return false;
00691 }
00692 
00693 /*************************************************************************************************/
00694 
00695 template <typename O> // O models OutputIterator
00696 bool xml_parser_t<O>::is_token(xml_lex_token_set_t token_name)
00697 {
00698     const token_type& result(get_token());
00699 
00700     if (result.enum_m == token_name)
00701         return true;
00702 
00703     putback();
00704 
00705     return false;
00706 }
00707 
00708 /*************************************************************************************************/
00709 
00710 template <typename O> // O models OutputIterator
00711 void xml_parser_t<O>::require_token(xml_lex_token_set_t token_name, token_range_t& token_range)
00712 {
00713     const token_type& result(get_token());
00714 
00715     if (result.enum_m != token_name)
00716         throw_exception(result.enum_m, token_name);
00717 
00718     token_range = result.range_m;
00719 }
00720 
00721 /*************************************************************************************************/
00722 
00723 template <typename O> // O models OutputIterator
00724 void xml_parser_t<O>::require_token(xml_lex_token_set_t token_name)
00725 {
00726     const token_type& result(get_token());
00727 
00728     if (result.enum_m != token_name)
00729         throw_exception(result.enum_m, token_name);
00730 }
00731 
00732 /*************************************************************************************************/
00733 
00734 template <typename O> // O models OutputIterator
00735 void xml_parser_t<O>::content_callback( token_range_t&           result_element,
00736                                         const token_range_t&     old_element,
00737                                         const token_range_t&     start_tag,
00738                                         const attribute_set_t    attribute_set,
00739                                         const token_range_t&     content,
00740                                         bool                            preorder_parent)
00741 {
00742     if (preorder_parent)
00743     {
00744         // if we are in preorder mode and we are the preorder_parent,
00745         // we send the content to the client callback function.
00746         // We get back a single token_range, which we then parse all
00747         // over again in a content parser all its own.
00748 
00749         token_range_t new_content(callback_m(old_element, start_tag, attribute_set, content));
00750 
00751         if (old_element == new_content)
00752         {
00753             // In the case when the new content is the same as the old element,
00754             // the user has opted to echo the element to the output unchanged.
00755 
00756             adobe::copy(old_element, output_m);
00757         }
00758         else
00759         {
00760             // otherwise we need to parse the new content before we can move on to
00761             // the rest of the parse. The new parser has the same predicate and
00762             // output iterator as this one
00763 
00764             xml_parser_t<O>( new_content.first, new_content.second,
00765                                     next_position(), pred_m, callback_m, output_m).parse_content();
00766         }
00767 
00768         // once the token_range from the client has been parsed, we can turn off
00769         // preorder mode and resume parsing the original token stream from where we
00770         // left off.
00771 
00772         preorder_mode_m = false; // only the preorder_parent can turn off preorder mode
00773     }
00774     else
00775     {
00776         // in the case we are in preorder mode but we are not the initiator of
00777         // the mode, we are within the context of another preorder parse. In
00778         // this case we use the entire contents of the element as the token range
00779         // and hand it back as the return value of this function.
00780 
00781         result_element = old_element;
00782     }
00783 }
00784 
00785 /*************************************************************************************************/
00786 
00787 template <typename O> // O models OutputIterator
00788 bool xml_parser_t<O>::is_element(token_range_t& element)
00789 {
00790     element = token_range_t();
00791 
00792     attribute_set_t attribute_set;
00793 
00794     token_range_t   open_tag;
00795     token_range_t   close_tag;
00796 
00797     if (!is_token(xml_token_open_tag_k, open_tag)) return false;
00798 
00799     token_range_t   start_tag;
00800     token_range_t   end_tag;
00801 
00802     require_token(xml_token_name_k, start_tag);
00803 
00804     bool preorder_parent(false); // explained below
00805 
00806     // Preorder mode is a state for the entire parser. In this state the
00807     // client processing callback is never called until the end of the
00808     // current element is found. This precludes the processing of elements
00809     // and other entities nested within this element from being handled until
00810     // this containing element is processed. This is useful in the case when 
00811     // the content of the element could potentially be replaced, in which
00812     // case processing the nested elements first would be a moot point.
00813 
00814     if (!preorder_mode_m && pred_m)
00815     {
00816         // preorder mode is only set when the predicate is defined and
00817         // returns true for the start_tag of this element.
00818 
00819         preorder_mode_m = pred_m(start_tag);
00820 
00821 
00822         // preorder_parent is used to denote which frame in the stack began
00823         // the preorder traversal, as it is this frame alone that can turn
00824         // it back off again.
00825 
00826         preorder_parent = preorder_mode_m;
00827     }
00828 
00829     is_attribute_set(attribute_set);
00830 
00831     if (is_token(xml_token_slash_close_tag_k, close_tag))
00832     {
00833         if (preorder_mode_m)
00834         {
00835             content_callback(   element,
00836                                 token_range_t(open_tag.first, close_tag.second),
00837                                 start_tag,
00838                                 attribute_set,
00839                                 token_range_t(),
00840                                 preorder_parent);
00841         }
00842         else
00843         {
00844             // in the case when we are not in preorder mode at all, we pass the element
00845             // to the client callback and output the token_range we receive back.
00846 
00847             token_range_t result(callback_m( token_range_t(open_tag.first, close_tag.second),
00848                                                     start_tag,
00849                                                     attribute_set,
00850                                                     token_range_t()));
00851 
00852             adobe::copy(result, output_m);
00853         }
00854 
00855         return true;
00856     }
00857 
00858     token_range_t close_of_open_tag;
00859 
00860     require_token(xml_token_close_tag_k, close_of_open_tag);
00861 
00862     token_range_t content;
00863 
00864     // In the case of inorder parsing we want to output the tags
00865     // as we see them; in this case we need to output the opening
00866     // tag before we can go on to the content parsing.
00867 
00868     if (!preorder_mode_m)
00869         std::copy(open_tag.first, close_of_open_tag.second, output_m);
00870 
00871     if (!is_content(content))
00872         throw std::runtime_error("Content expected but not found.");
00873     
00874     if (!is_e_tag(end_tag, close_tag))
00875         throw std::runtime_error("End tag expected but not found.");
00876 
00877     if (!token_range_equal(start_tag, end_tag))
00878         throw std::runtime_error("Start tag and end tag do not have the same name.");
00879 
00880     if (!preorder_mode_m)
00881     {
00882         // in the case when we are not in preorder mode
00883         // we output the content we have immediately,
00884         // then we need to output the closing tag before
00885         // we can go on to the rest of the parse.
00886 
00887         adobe::copy(content, output_m);
00888         adobe::copy(token_range_t(end_tag.first - 2, end_tag.second + 1), output_m);
00889     }
00890     else
00891     {
00892         // In this instance we are continuing a preorder parse...
00893 
00894         content_callback(   element,
00895                             token_range_t(open_tag.first, close_tag.second),
00896                             start_tag,
00897                             attribute_set,
00898                             content,
00899                             preorder_parent);
00900     }
00901 
00902     return true;
00903 }
00904 
00905 /*************************************************************************************************/
00906 
00907 template <typename O> // O models OutputIterator
00908 bool xml_parser_t<O>::is_content(token_range_t& content)
00909 {
00910     content = token_range_t();
00911 
00912     token_range_t char_data;
00913 
00914     // NOTE (fbrereto) :    The content parser can never initiate a preorder mode.
00915     //                      It can only be initiated by the parsing of a preorder
00916     //                      element, which isn't handled here. So for the content
00917     //                      parse we are either in preorder mode or not; we need
00918     //                      not worry about managing it.
00919 
00920     if (is_token(xml_token_char_data_k, char_data))
00921     {
00922         // in the case when we are in preorder mode, we are part of a nested
00923         // content, and we want to use this beginning char_data token as the
00924         // start of the overall content token_range.
00925 
00926         if (preorder_mode_m)
00927             { content = char_data; }
00928 
00929         // in the case when we are not in preorder mode this range of char_data
00930         // needs to be sent directly to the output.
00931 
00932         else
00933             { adobe::copy(char_data, output_m); }
00934     }
00935 
00936     while (true)
00937     {
00938         token_range_t result;
00939 
00940         if (is_token(xml_token_reference_k, result))
00941         {
00942             if (boost::size(result))
00943             {
00944                 if (preorder_mode_m)
00945                 {
00946                     // Again, if we're in preorder mode we're not outputting
00947                     // but extending (possibly even starting, too) the token_range
00948                     // for the preorder element.
00949 
00950                     if (!content.first) content.first = result.first;
00951 
00952                     content.second = result.second;
00953                 }
00954                 else
00955                 {
00956                     // if we're not in preorder mode, we pass the element's
00957                     // reference-transformed token_range result directly to
00958                     // the output.
00959 
00960                     adobe::copy(implementation::transform_reference(result), output_m);
00961                 }
00962             }
00963         }
00964         else if (is_element(result))
00965         {
00966             if (boost::size(result))
00967             {
00968                 if (preorder_mode_m)
00969                 {
00970                     // Again, if we're in preorder mode we're not outputting
00971                     // but extending (possibly even starting, too) the token_range
00972                     // for the preorder element.
00973 
00974                     if (!content.first) content.first = result.first;
00975 
00976                     content.second = result.second;
00977                 }
00978                 else
00979                 {
00980                     // if we're not in preorder mode, we pass the element's
00981                     // token_range result directly to the output.
00982 
00983                     adobe::copy(result, output_m);
00984                 }
00985             }
00986         }
00987         else if (is_token(xml_token_comment_k, result))
00988         {
00989             // Comments are not parsed by any client functions.
00990             // They are merely ignored by the parser.
00991 
00992             // REVISIT eberdahl - Because some clients may want to
00993             // handle comments, we may want to extend the client
00994             // callback system to permit a comment callback.
00995         }
00996         else
00997             { break; }
00998 
00999         if (is_token(xml_token_char_data_k, char_data))
01000         {
01001             // if we find more char_data at the end of the content, we
01002             // either extent the preorder content data or we output
01003             // the contents of the char_data directly to the output (in
01004             // fullorder mode).
01005 
01006             if (preorder_mode_m)
01007                 { content.second = char_data.second; }
01008             else
01009                 { adobe::copy(char_data, output_m); }
01010         }
01011     }
01012 
01013     return true;
01014 }
01015 
01016 /*************************************************************************************************/
01017 
01018 template <typename O> // O models OutputIterator
01019 bool xml_parser_t<O>::is_e_tag(token_range_t& name, token_range_t& close_tag)
01020 {
01021     if (!is_token(xml_token_open_slash_tag_k)) return false;
01022 
01023     require_token(xml_token_name_k, name);
01024 
01025     require_token(xml_token_close_tag_k, close_tag);
01026 
01027     return true;
01028 }
01029 
01030 /*************************************************************************************************/
01031 
01032 template <typename O> // O models OutputIterator
01033 bool xml_parser_t<O>::is_attribute_set(attribute_set_t& attribute_set)
01034 {
01035     token_range_t att_name;
01036     token_range_t att_value;
01037 
01038     while (is_attribute(att_name, att_value))
01039         attribute_set.insert(att_name, att_value);
01040 
01041     return true;
01042 }
01043 
01044 /*************************************************************************************************/
01045 
01046 template <typename O> // O models OutputIterator
01047 bool xml_parser_t<O>::is_prolog()
01048 {
01049     token_range_t bom;
01050     token_range_t xml_decl;
01051 
01052     if (is_bom(bom))
01053     {
01054         // REVISIT eberdahl 2006 Jun 18 - sanity check the bom
01055     }
01056     
01057     if (is_xml_decl(xml_decl))
01058     {
01059         // REVISIT eberdahl 2006 Jun 18 - sanity check the encoding
01060         // of the XMLDecl
01061         
01062         return true;
01063     }
01064     
01065     return false;
01066 }
01067 
01068 /*************************************************************************************************/
01069 
01070 template <typename O> // O models OutputIterator
01071 bool xml_parser_t<O>::is_bom(token_range_t& bom)
01072 {
01073     const token_range_t utf8_bom = static_token_range("\xEF\xBB\xBF");
01074     const token_range_t utf16_be_bom = static_token_range("\xFE\xFF");
01075     const token_range_t utf16_le_bom = static_token_range("\xFF\xFE");
01076 
01077     bool result = false;
01078     
01079     // whitespace skipping should be off when sniffing for a bom
01080     token_stream_m.set_skip_white_space(false);
01081 
01082     if (is_token(xml_token_char_data_k, bom))
01083     {
01084         if (boost::size(utf8_bom) <= boost::size(bom) &&
01085             adobe::equal(utf8_bom, bom.first))
01086         {
01087             bom.second = bom.first;
01088             std::advance(bom.second, boost::size(utf8_bom));
01089 
01090             result = true;
01091         }
01092         else if (boost::size(utf16_be_bom) <= boost::size(bom) &&
01093                  adobe::equal(utf16_be_bom, bom.first))
01094         {
01095             // it's a bom, but it's not a format the parser supports
01096             throw_exception("utf16be bom encountered; xml_parser_t only supports utf8 encoding");           
01097         }
01098         else if (boost::size(utf16_le_bom) <= boost::size(bom) &&
01099                  adobe::equal(utf16_le_bom, bom.first))
01100         {
01101             // it's a bom, but it's not a format the parser supports
01102             throw_exception("utf16le bom encountered; xml_parser_t only supports utf8 encoding");           
01103         }
01104     }
01105 
01106     token_stream_m.set_skip_white_space(true);
01107 
01108     return result;
01109 }
01110 
01111 /*************************************************************************************************/
01112 
01113 template <typename O> // O models OutputIterator
01114 bool xml_parser_t<O>::is_xml_decl(token_range_t& xml_decl)
01115 {
01116     if (is_token(xml_token_processing_instruction_k, xml_decl))
01117     {
01118         // REVISIT eberdahl 2006 Jun 18 - sanity check that the PI
01119         // encountered is, in fact, targeted at the xml application
01120         
01121         return true;
01122     }
01123     
01124     return false;
01125 }
01126 
01127 /*************************************************************************************************/
01128 
01129 template <typename O> // O models OutputIterator
01130 bool xml_parser_t<O>::is_attribute(token_range_t& name, token_range_t& value)
01131 {
01132     if (is_token(xml_token_name_k, name))
01133     {
01134         require_token(xml_token_equals_k);
01135 
01136         require_token(xml_token_att_value_k, value);
01137 
01138         return true;
01139     }
01140 
01141     return false;
01142 }
01143 
01144 /*************************************************************************************************/
01145 
01146 template <typename O> // O models OutputIterator
01147 void xml_parser_t<O>::parse_element_sequence()
01148 {
01149     assert(callback_m);
01150     
01151     token_range_t dummy;
01152 
01153     token_stream_m.set_skip_white_space(false);
01154 
01155     while (is_element(dummy))
01156         is_token(xml_token_char_data_k);
01157 }
01158 
01159 /*************************************************************************************************/
01160 
01161 template <typename O> // O models OutputIterator
01162 void xml_parser_t<O>::parse_content()
01163 {
01164     token_range_t content;
01165 
01166     token_stream_m.set_skip_white_space(false);
01167 
01168     while (true)
01169     {
01170         // always returns true; have to test results
01171         is_content(content);
01172 
01173         if (boost::size(content))
01174         {
01175             token_range_t result(this->callback_m(   content,
01176                                                             token_range_t(),
01177                                                             attribute_set_t(),
01178                                                             content));
01179 
01180             adobe::copy(result, this->output_m);
01181         }
01182         else
01183             break;
01184     }
01185 }
01186 
01187 /*************************************************************************************************/
01188 
01189 template <typename O> // O models OutputIterator
01190 void xml_parser_t<O>::parse_document()
01191 {
01192     token_range_t dummy;
01193     
01194     token_stream_m.set_skip_white_space(true);
01195     
01196     is_prolog();
01197     is_element(dummy);
01198 }
01199 
01200 /*************************************************************************************************/
01201 
01221 template <typename O> // O models OutputIterator
01222 inline xml_parser_t<O> make_xml_parser( uchar_ptr_t                                     first,
01223                                         uchar_ptr_t                                     last,
01224                                         const line_position_t&                          position,
01225                                         typename xml_parser_t<O>::preorder_predicate_t  predicate,
01226                                         typename xml_parser_t<O>::callback_proc_t       callback,
01227                                         O                                               output)
01228 { return xml_parser_t<O>(first, last, position, predicate, callback, output); }
01229 
01230 /*************************************************************************************************/
// Converts a leading run of hexadecimal digits in [first, last) to a number.
//
// first, last - range of characters to read.
// result      - set to 0, then accumulates the value of the digits read
//               (16 * result + digit for each one). Overflow of Result is
//               not detected.
//
// Returns an iterator to the first character that is not a hexadecimal
// digit, or last if the whole range was consumed. Upper- and lower-case
// letters A-F are both accepted.
//
// Characters are classified with plain range comparisons rather than the
// <cctype> functions: those have undefined behaviour for negative char
// values (any byte >= 0x80 where char is signed), and the comparisons also
// avoid building a std::locale for every hex letter.
template <typename Result, typename InputIterator>
InputIterator xatoi(InputIterator first, InputIterator last, Result& result)
{
    typedef typename std::iterator_traits<InputIterator>::value_type char_type;

    result = 0;

    for (; first != last; ++first)
    {
        // Read the element once so single-pass iterators are handled too.
        const char_type c(*first);
        int             digit;

        if (c >= '0' && c <= '9')
            digit = static_cast<int>(c - '0');
        else if (c >= 'a' && c <= 'f')
            digit = static_cast<int>(c - 'a') + 10;
        else if (c >= 'A' && c <= 'F')
            digit = static_cast<int>(c - 'A') + 10;
        else
            break; // not a hexadecimal digit: stop without consuming it

        result <<= 4;
        result += digit;
    }

    return first;
}
01263 
01264 /*************************************************************************************************/
// Converts a leading run of decimal digits in [first, last) to a number.
//
// first, last - range of characters to read.
// result      - set to 0, then accumulates the value of the digits read
//               (10 * result + digit for each one). Overflow of Result is
//               not detected.
//
// Returns an iterator to the first character that is not a decimal digit,
// or last if the whole range was consumed. No sign or leading white space
// is accepted.
//
// Digits are recognised with a plain range comparison rather than
// std::isdigit, which has undefined behaviour for negative char values (any
// byte >= 0x80 where char is signed).
template <typename Result, typename InputIterator>
InputIterator datoi(InputIterator first, InputIterator last, Result& result)
{
    typedef typename std::iterator_traits<InputIterator>::value_type char_type;

    result = 0;

    for (; first != last; ++first)
    {
        // Read the element once so single-pass iterators are handled too.
        const char_type c(*first);

        if (c < '0' || c > '9')
            break; // not a decimal digit: stop without consuming it

        result *= 10;
        result += static_cast<int>(c - '0');
    }

    return first;
}
01286 
01287 /*************************************************************************************************/
01288 
01289 } // namespace adobe
01290 
01291 /*************************************************************************************************/
01292 
01293 #endif
01294 
01295 /*************************************************************************************************/
Documentation

More Info

Media

Support

RSS

Other Adobe Projects

Other Resources

xml_parser.hpp