xml_parser.hpp
Go to the documentation of this file.
00001 /* 00002 Copyright 2005-2007 Adobe Systems Incorporated 00003 Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt 00004 or a copy at http://stlab.adobe.com/licenses.html) 00005 */ 00006 00007 /*************************************************************************************************/ 00008 00009 #ifndef ADOBE_XML_PARSER_HPP 00010 #define ADOBE_XML_PARSER_HPP 00011 00012 /*************************************************************************************************/ 00013 00014 #include <adobe/config.hpp> 00015 00016 #include <adobe/any_regular.hpp> 00017 #include <adobe/algorithm/set.hpp> 00018 #include <adobe/istream.hpp> 00019 #include <adobe/array.hpp> 00020 #include <adobe/copy_on_write.hpp> 00021 #include <adobe/name.hpp> 00022 #include <adobe/dictionary.hpp> 00023 #include <adobe/string.hpp> 00024 #include <adobe/implementation/xml_lex.hpp> 00025 #include <adobe/implementation/xml_token.hpp> 00026 #include <adobe/implementation/parser_shared.hpp> 00027 00028 #include <boost/function.hpp> 00029 #include <boost/noncopyable.hpp> 00030 #include <boost/operators.hpp> 00031 #include <boost/bind.hpp> 00032 #include <boost/array.hpp> 00033 #include <boost/iterator/iterator_facade.hpp> 00034 00035 #include <utility> 00036 #include <istream> 00037 #include <sstream> 00038 #include <iomanip> 00039 #include <cassert> 00040 #include <list> 00041 00042 /*************************************************************************************************/ 00043 00044 namespace adobe { 00045 00046 /*************************************************************************************************/ 00047 00048 // NOTE (fbrereto) : Class declaration for the documentation is in xml_parser.dox 00049 struct attribute_set_t : public boost::equality_comparable<attribute_set_t> 00050 { 00051 typedef token_range_t key_type; 00052 typedef token_range_t mapped_type; 00053 typedef std::pair<key_type, mapped_type> value_type; 00054 typedef std::vector<value_type> set_type; 00055 typedef set_type::size_type size_type; 00056 typedef set_type::const_iterator const_iterator; 00057 typedef const_iterator iterator; 00058 00065 struct less_t : std::binary_function<value_type, value_type, bool> 00066 { 00067 bool operator () (const value_type& x, const value_type& y) const 00068 { 00069 return token_range_less(x.first, y.first) || 00070 (!token_range_less(y.first, x.first) && 00071 token_range_less(x.second, y.second)); 00072 } 00073 }; 00074 00080 struct less_key_only_t : std::binary_function<value_type, value_type, bool> 00081 { 00082 bool operator () (const value_type& x, const value_type& y) const 00083 { 00084 return token_range_less(x.first, y.first); 00085 } 00086 }; 00087 00099 bool lower_bound(const value_type& attribute, set_type::iterator& result) 00100 { 00101 result = adobe::lower_bound(set_m.write(), attribute, less_key_only_t()); 00102 00103 return result != set_m.write().end() && 00104 token_range_equal(result->first, attribute.first); 00105 } 00106 00118 bool lower_bound(const key_type& key, set_type::iterator& result) 00119 { return lower_bound(value_type(key, mapped_type()), result); } 00120 00124 bool lower_bound(const value_type& attribute, set_type::const_iterator& result) const 00125 { 00126 result = adobe::lower_bound(*set_m, attribute, less_key_only_t()); 00127 00128 return result != set_m->end() && 00129 token_range_equal(result->first, attribute.first); 00130 } 00131 00135 bool lower_bound(const key_type& key, set_type::const_iterator& result) const 00136 { return lower_bound(value_type(key, mapped_type()), result); } 00137 00147 mapped_type operator [] (const key_type& key) const 00148 { 00149 set_type::const_iterator result; 00150 00151 if (lower_bound(key, result)) 00152 return result->second; 00153 00154 return mapped_type(); 00155 } 00156 00171 attribute_set_t merge(const attribute_set_t& other_set) const 00172 { 00173 00174 attribute_set_t merged; 00175 00176 adobe::set_union(*set_m, *other_set.set_m, std::back_inserter(merged.set_m.write()), less_key_only_t()); 00177 00178 return merged; 00179 } 00180 00192 void insert(const value_type& attribute) 00193 { 00194 set_type::iterator result; 00195 00196 if (lower_bound(attribute, result)) 00197 result->second = attribute.second; 00198 else 00199 set_m.write().insert(result, attribute); 00200 } 00201 00212 template <typename I> // I models InputIterator 00213 inline void insert(I first, I last) 00214 { for (; first != last; ++first) insert(*first); } 00215 00224 inline void insert(const key_type& key, const mapped_type& value) 00225 { insert(value_type(key, value)); } 00226 00237 std::size_t count_same(const attribute_set_t& other_set, bool mapped_matters = true) const; 00238 00252 bool has_collisions(const attribute_set_t& other_set) const; 00253 00263 std::size_t count_collisions(const attribute_set_t& other_set) const; 00264 00268 inline bool empty() const 00269 { return set_m->empty(); } 00270 00275 inline size_type size() const 00276 { return set_m->size(); } 00277 00282 const_iterator begin() const { return set_m->begin(); } 00283 00288 const_iterator end() const { return set_m->end(); } 00289 00296 void clear() { set_m.write().clear(); } 00297 00298 private: 00299 friend bool operator == (const attribute_set_t& x, const attribute_set_t& y); 00300 friend std::ostream& operator << (std::ostream& s, const attribute_set_t& attribute_set); 00301 00302 copy_on_write<set_type> set_m; 00303 }; 00304 00305 /*************************************************************************************************/ 00306 00319 inline bool operator == (const attribute_set_t& x, const attribute_set_t& y) 00320 { 00321 return x.set_m->size() == y.set_m->size() && x.count_same(y) == x.set_m->size(); 00322 } 00323 00324 /*************************************************************************************************/ 00325 00337 inline std::ostream& operator << (std::ostream& s, const attribute_set_t& attribute_set) 00338 { 00339 attribute_set_t::set_type::const_iterator first(attribute_set.set_m->begin()); 00340 attribute_set_t::set_type::const_iterator last(attribute_set.set_m->end()); 00341 bool not_first(false); 00342 00343 for (; first != last; ++first) 00344 { 00345 if (not_first) 00346 s << " "; 00347 else 00348 not_first = true; 00349 00350 adobe::copy(first->first, std::ostream_iterator<char>(s)); 00351 00352 s << "='"; 00353 00354 adobe::copy(first->second, std::ostream_iterator<char>(s)); 00355 00356 s << "'"; 00357 } 00358 00359 return s; 00360 } 00361 00362 /*************************************************************************************************/ 00363 00364 inline std::size_t attribute_set_t::count_same(const attribute_set_t& other_set, bool mapped_matters) const 00365 { 00366 std::size_t result(0); 00367 00368 if (mapped_matters) 00369 result = adobe::set_intersection( *set_m, *other_set.set_m, 00370 counting_output_iterator(), 00371 less_t()) 00372 .count(); 00373 else 00374 result = adobe::set_intersection( *set_m, *other_set.set_m, 00375 counting_output_iterator(), 00376 less_key_only_t()) 00377 .count(); 00378 00379 #if 0 00380 std::cerr << " count_same:\n" 00381 << " orig: " << *this << "\n" 00382 << " test: " << other_set << "\n" 00383 << " result: " << result << std::endl; 00384 #endif 00385 00386 return result; 00387 } 00388 00389 /*************************************************************************************************/ 00390 00391 inline bool attribute_set_t::has_collisions(const attribute_set_t& other_set) const 00392 { 00393 attribute_set_t::set_type::const_iterator first(set_m->begin()); 00394 attribute_set_t::set_type::const_iterator last(set_m->end()); 00395 00396 for (; first != last; ++first) 00397 { 00398 set_type::const_iterator result; 00399 00400 if (other_set.lower_bound(*first, result) && !token_range_equal(result->second, first->second)) 00401 return true; 00402 } 00403 00404 return false; 00405 } 00406 00407 /*************************************************************************************************/ 00408 00409 inline std::size_t attribute_set_t::count_collisions(const attribute_set_t& other_set) const 00410 { 00411 attribute_set_t::set_type::const_iterator first(set_m->begin()); 00412 attribute_set_t::set_type::const_iterator last(set_m->end()); 00413 std::size_t collision_count(0); 00414 00415 for (; first != last; ++first) 00416 { 00417 set_type::const_iterator result; 00418 00419 if (other_set.lower_bound(*first, result) && result->second != first->second) 00420 ++collision_count; 00421 } 00422 00423 return collision_count; 00424 } 00425 00426 /*************************************************************************************************/ 00427 00428 // REVISIT (sparent) : Extra typedef just for the doxygen tool. 00429 00430 typedef token_range_t (implementation_xml_element_proc_t)( 00431 const token_range_t& entire_element_range, 00432 const token_range_t& name, 00433 const attribute_set_t& attribute_set, 00434 const token_range_t& value); 00435 00436 typedef boost::function<implementation_xml_element_proc_t> xml_element_proc_t; 00437 00438 /*************************************************************************************************/ 00439 00440 // NOTE (fbrereto) : Class declaration for the documentation is in xml_parser.dox 00441 template <typename O> // O models OutputIterator 00442 class xml_parser_t : public boost::noncopyable 00443 { 00444 public: 00445 typedef xml_element_proc_t callback_proc_t; 00446 typedef boost::function<bool (const token_range_t&)> preorder_predicate_t; 00447 typedef xml_lex_t::token_type token_type; 00448 00449 xml_parser_t( uchar_ptr_t first, 00450 uchar_ptr_t last, 00451 const line_position_t& position, 00452 preorder_predicate_t predicate, 00453 callback_proc_t callback, 00454 O output) : 00455 pred_m(predicate), 00456 callback_m(callback), 00457 output_m(output), 00458 token_stream_m(first, last, position), 00459 preorder_mode_m(false) 00460 { } 00461 00462 xml_parser_t(const xml_parser_t& rhs) : 00463 pred_m(rhs.pred_m), 00464 callback_m(rhs.callback_m), 00465 output_m(rhs.output_m), 00466 token_stream_m(rhs.token_stream_m), 00467 preorder_mode_m(rhs.preorder_mode_m) 00468 { } 00469 00470 xml_parser_t& operator = (const xml_parser_t& rhs) 00471 { 00472 pred_m = rhs.pred_m; 00473 callback_m = rhs.callback_m; 00474 output_m = rhs.output_m; 00475 token_stream_m = rhs.token_stream_m; 00476 preorder_mode_m = rhs.preorder_mode_m; 00477 00478 return *this; 00479 } 00480 00481 virtual ~xml_parser_t() 00482 { } 00483 00484 const line_position_t& next_position() 00485 { return token_stream_m.next_position(); } 00486 00492 void set_preorder_predicate(preorder_predicate_t pred) 00493 { pred_m = pred; } 00494 00518 void parse_element_sequence(); 00519 00565 void parse_content(); 00566 00570 void parse_document(); 00571 00572 /* 00573 REVISIT (sparent) : We should provide a protected call to get the token stream and allow 00574 subclasses to access it directly - but for now we'll stick with the law of Demiter. 00575 */ 00576 00577 protected: 00578 const token_type& get_token() 00579 { return token_stream_m.get(); } 00580 void putback() 00581 { token_stream_m.putback(); } 00582 00583 bool is_token(xml_lex_token_set_t name, token_range_t& value); 00584 bool is_token(xml_lex_token_set_t name); 00585 void require_token(xml_lex_token_set_t name, token_range_t& value); 00586 void require_token(xml_lex_token_set_t name); 00587 00588 /* REVISIT (sparent) : Should these be const? And is there a way to specify the class to throw? */ 00589 00590 void throw_exception(const char* error_string) 00591 { throw_parser_exception(error_string, next_position()); } 00592 void throw_exception(xml_lex_token_set_t found, xml_lex_token_set_t expected) 00593 { throw_parser_exception(token_to_string(found), token_to_string(expected), next_position()); } 00594 00595 bool is_element(token_range_t& element); 00596 bool is_content(token_range_t& element); 00597 bool is_e_tag(token_range_t& name, token_range_t& close_tag); 00598 bool is_attribute_set(attribute_set_t& attribute_set); 00599 bool is_attribute(token_range_t& name, token_range_t& value); 00600 bool is_prolog(); 00601 bool is_bom(token_range_t& bom); 00602 bool is_xml_decl(token_range_t& xml_decl); 00603 00604 void content_callback( token_range_t& result_element, 00605 const token_range_t& old_element, 00606 const token_range_t& start_tag, 00607 const attribute_set_t attribute_set, 00608 const token_range_t& content, 00609 bool preorder_parent); 00610 00611 preorder_predicate_t pred_m; 00612 callback_proc_t callback_m; 00613 O output_m; 00614 00615 private: 00616 xml_lex_t token_stream_m; 00617 bool preorder_mode_m; 00618 }; 00619 00620 /*************************************************************************************************/ 00621 00622 inline token_range_t xml_element_echo( const token_range_t& entire_element_range, 00623 const token_range_t& /*name*/, 00624 const attribute_set_t& /*attribute_set*/, 00625 const token_range_t& /*value*/) 00626 { return entire_element_range; } 00627 00628 /*************************************************************************************************/ 00629 00630 inline token_range_t xml_element_strip( const token_range_t& /*entire_element_range*/, 00631 const token_range_t& /*name*/, 00632 const attribute_set_t& /*attribute_set*/, 00633 const token_range_t& value) 00634 { return value; } 00635 00636 /*************************************************************************************************/ 00637 00638 inline token_range_t xml_element_linefeed( const token_range_t& /*entire_element_range*/, 00639 const token_range_t& name, 00640 const attribute_set_t& attribute_set, 00641 const token_range_t& value) 00642 { 00643 if (token_range_equal(name, static_token_range("br")) && 00644 attribute_set.empty() && 00645 boost::size(value) == 0) 00646 { 00647 #if ADOBE_PLATFORM_WIN 00648 return static_token_range("&cr;&lf;"); 00649 #elif ADOBE_PLATFORM_MAC 00650 return static_token_range("&cr;"); 00651 #elif ADOBE_PLATFORM_UNIX || ADOBE_PLATFORM_LINUX || ADOBE_PLATFORM_BSD || ADOBE_PLATFORM_SOLARIS ||\ 00652 ADOBE_PLATFORM_IRIX || ADOBE_PLATFORM_HPUX || ADOBE_PLATFORM_CYGWIN || ADOBE_PLATFORM_AIX 00653 return static_token_range("&lf;"); 00654 #else 00655 #error "Line ending for platform unknown - please configure and report the results to stlab.adobe.com" 00656 #endif 00657 } 00658 00659 return value; 00660 } 00661 00662 /*************************************************************************************************/ 00663 00664 namespace implementation { 00665 00666 /*************************************************************************************************/ 00667 00668 token_range_t transform_reference(const token_range_t& reference); 00669 00670 /*************************************************************************************************/ 00671 00672 } // namespace implementation 00673 00674 /*************************************************************************************************/ 00675 00676 template <typename O> // O models OutputIterator 00677 bool xml_parser_t<O>::is_token(xml_lex_token_set_t token_name, token_range_t& token_range) 00678 { 00679 const token_type& result(get_token()); 00680 00681 if (result.enum_m == token_name) 00682 { 00683 token_range = result.range_m; 00684 00685 return true; 00686 } 00687 00688 putback(); 00689 00690 return false; 00691 } 00692 00693 /*************************************************************************************************/ 00694 00695 template <typename O> // O models OutputIterator 00696 bool xml_parser_t<O>::is_token(xml_lex_token_set_t token_name) 00697 { 00698 const token_type& result(get_token()); 00699 00700 if (result.enum_m == token_name) 00701 return true; 00702 00703 putback(); 00704 00705 return false; 00706 } 00707 00708 /*************************************************************************************************/ 00709 00710 template <typename O> // O models OutputIterator 00711 void xml_parser_t<O>::require_token(xml_lex_token_set_t token_name, token_range_t& token_range) 00712 { 00713 const token_type& result(get_token()); 00714 00715 if (result.enum_m != token_name) 00716 throw_exception(result.enum_m, token_name); 00717 00718 token_range = result.range_m; 00719 } 00720 00721 /*************************************************************************************************/ 00722 00723 template <typename O> // O models OutputIterator 00724 void xml_parser_t<O>::require_token(xml_lex_token_set_t token_name) 00725 { 00726 const token_type& result(get_token()); 00727 00728 if (result.enum_m != token_name) 00729 throw_exception(result.enum_m, token_name); 00730 } 00731 00732 /*************************************************************************************************/ 00733 00734 template <typename O> // O models OutputIterator 00735 void xml_parser_t<O>::content_callback( token_range_t& result_element, 00736 const token_range_t& old_element, 00737 const token_range_t& start_tag, 00738 const attribute_set_t attribute_set, 00739 const token_range_t& content, 00740 bool preorder_parent) 00741 { 00742 if (preorder_parent) 00743 { 00744 // if we are in preorder mode and we are the preorder_parent, 00745 // we send the content to the client callback function. 00746 // We get back a single token_range, which we then parse all 00747 // over again in a content parser all its own. 00748 00749 token_range_t new_content(callback_m(old_element, start_tag, attribute_set, content)); 00750 00751 if (old_element == new_content) 00752 { 00753 // In the case when the new content is the same as the old element, 00754 // the user has opted to echo the element to the output unchanged. 00755 00756 adobe::copy(old_element, output_m); 00757 } 00758 else 00759 { 00760 // otherwise we need to parse the new content before we can move on to 00761 // the rest of the parse. The new parser has the same predicate and 00762 // output iterator as this one 00763 00764 xml_parser_t<O>( new_content.first, new_content.second, 00765 next_position(), pred_m, callback_m, output_m).parse_content(); 00766 } 00767 00768 // once the token_range from the client has been parsed, we can turn off 00769 // preorder mode and resume parsing the original token stream from where we 00770 // left off. 00771 00772 preorder_mode_m = false; // only the preorder_parent can turn off preorder mode 00773 } 00774 else 00775 { 00776 // in the case we are in preorder mode but we are not the initiator of 00777 // the mode, we are within the context of another preorder parse. In 00778 // this case we use the entire contents of the element as the token range 00779 // and hand it back as the return value of this function. 00780 00781 result_element = old_element; 00782 } 00783 } 00784 00785 /*************************************************************************************************/ 00786 00787 template <typename O> // O models OutputIterator 00788 bool xml_parser_t<O>::is_element(token_range_t& element) 00789 { 00790 element = token_range_t(); 00791 00792 attribute_set_t attribute_set; 00793 00794 token_range_t open_tag; 00795 token_range_t close_tag; 00796 00797 if (!is_token(xml_token_open_tag_k, open_tag)) return false; 00798 00799 token_range_t start_tag; 00800 token_range_t end_tag; 00801 00802 require_token(xml_token_name_k, start_tag); 00803 00804 bool preorder_parent(false); // explained below 00805 00806 // Preorder mode is a state for the entire parser. In this state the 00807 // client processing callback is never called until the end of the 00808 // current element is found. This precludes the processing of elements 00809 // and other entities nested within this element from being handled until 00810 // this containing element is processed. This is useful in the case when 00811 // the content of the element could potentially be replaced, in which 00812 // case processing the nested elements first would be a moot point. 00813 00814 if (!preorder_mode_m && pred_m) 00815 { 00816 // preorder mode is only set when the predicate is defined and 00817 // returns true for the start_tag of this element. 00818 00819 preorder_mode_m = pred_m(start_tag); 00820 00821 00822 // preorder_parent is used to denote which frame in the stack began 00823 // the preorder traversal, as it is this frame alone that can turn 00824 // it back off again. 00825 00826 preorder_parent = preorder_mode_m; 00827 } 00828 00829 is_attribute_set(attribute_set); 00830 00831 if (is_token(xml_token_slash_close_tag_k, close_tag)) 00832 { 00833 if (preorder_mode_m) 00834 { 00835 content_callback( element, 00836 token_range_t(open_tag.first, close_tag.second), 00837 start_tag, 00838 attribute_set, 00839 token_range_t(), 00840 preorder_parent); 00841 } 00842 else 00843 { 00844 // in the case when we are not in preorder mode at all, we pass the element 00845 // to the client callback and output the token_range we receive back. 00846 00847 token_range_t result(callback_m( token_range_t(open_tag.first, close_tag.second), 00848 start_tag, 00849 attribute_set, 00850 token_range_t())); 00851 00852 adobe::copy(result, output_m); 00853 } 00854 00855 return true; 00856 } 00857 00858 token_range_t close_of_open_tag; 00859 00860 require_token(xml_token_close_tag_k, close_of_open_tag); 00861 00862 token_range_t content; 00863 00864 // In the case of inorder parsing we want to output the tags 00865 // as we see them; in this case we need to output the opening 00866 // tag before we can go on to the content parsing. 00867 00868 if (!preorder_mode_m) 00869 std::copy(open_tag.first, close_of_open_tag.second, output_m); 00870 00871 if (!is_content(content)) 00872 throw std::runtime_error("Content expected but not found."); 00873 00874 if (!is_e_tag(end_tag, close_tag)) 00875 throw std::runtime_error("End tag expected but not found."); 00876 00877 if (!token_range_equal(start_tag, end_tag)) 00878 throw std::runtime_error("Start tag and end tag do not have the same name."); 00879 00880 if (!preorder_mode_m) 00881 { 00882 // in the case when we are not in preorder mode 00883 // we output the content we have immediately, 00884 // then we need to output the closing tag before 00885 // we can go on to the rest of the parse. 00886 00887 adobe::copy(content, output_m); 00888 adobe::copy(token_range_t(end_tag.first - 2, end_tag.second + 1), output_m); 00889 } 00890 else 00891 { 00892 // In this instance we are continuing a preorder parse... 00893 00894 content_callback( element, 00895 token_range_t(open_tag.first, close_tag.second), 00896 start_tag, 00897 attribute_set, 00898 content, 00899 preorder_parent); 00900 } 00901 00902 return true; 00903 } 00904 00905 /*************************************************************************************************/ 00906 00907 template <typename O> // O models OutputIterator 00908 bool xml_parser_t<O>::is_content(token_range_t& content) 00909 { 00910 content = token_range_t(); 00911 00912 token_range_t char_data; 00913 00914 // NOTE (fbrereto) : The content parser can never initiate a preorder mode. 00915 // It can only be initiated by the parsing of a preorder 00916 // element, which isn't handled here. So for the content 00917 // parse we are either in preorder mode or not; we need 00918 // not worry about managing it. 00919 00920 if (is_token(xml_token_char_data_k, char_data)) 00921 { 00922 // in the case when we are in preorder mode, we are part of a nested 00923 // content, and we want to use this beginning char_data token as the 00924 // start of the overall content token_range. 00925 00926 if (preorder_mode_m) 00927 { content = char_data; } 00928 00929 // in the case when we are not in preorder mode this range of char_data 00930 // needs to be sent directly to the output. 00931 00932 else 00933 { adobe::copy(char_data, output_m); } 00934 } 00935 00936 while (true) 00937 { 00938 token_range_t result; 00939 00940 if (is_token(xml_token_reference_k, result)) 00941 { 00942 if (boost::size(result)) 00943 { 00944 if (preorder_mode_m) 00945 { 00946 // Again, if we're in preorder mode we're not outputting 00947 // but extending (possibly even starting, too) the token_range 00948 // for the preorder element. 00949 00950 if (!content.first) content.first = result.first; 00951 00952 content.second = result.second; 00953 } 00954 else 00955 { 00956 // if we're not in preorder mode, we pass the element's 00957 // reference-transformed token_range result directly to 00958 // the output. 00959 00960 adobe::copy(implementation::transform_reference(result), output_m); 00961 } 00962 } 00963 } 00964 else if (is_element(result)) 00965 { 00966 if (boost::size(result)) 00967 { 00968 if (preorder_mode_m) 00969 { 00970 // Again, if we're in preorder mode we're not outputting 00971 // but extending (possibly even starting, too) the token_range 00972 // for the preorder element. 00973 00974 if (!content.first) content.first = result.first; 00975 00976 content.second = result.second; 00977 } 00978 else 00979 { 00980 // if we're not in preorder mode, we pass the element's 00981 // token_range result directly to the output. 00982 00983 adobe::copy(result, output_m); 00984 } 00985 } 00986 } 00987 else if (is_token(xml_token_comment_k, result)) 00988 { 00989 // Comments are not parsed by any client functions. 00990 // They are merely ignored by the parser. 00991 00992 // REVISIT eberdahl - Because some clients may want to 00993 // handle comments, we may want to extend the client 00994 // callback system to permit a comment callback. 00995 } 00996 else 00997 { break; } 00998 00999 if (is_token(xml_token_char_data_k, char_data)) 01000 { 01001 // if we find more char_data at the end of the content, we 01002 // either extent the preorder content data or we output 01003 // the contents of the char_data directly to the output (in 01004 // fullorder mode). 01005 01006 if (preorder_mode_m) 01007 { content.second = char_data.second; } 01008 else 01009 { adobe::copy(char_data, output_m); } 01010 } 01011 } 01012 01013 return true; 01014 } 01015 01016 /*************************************************************************************************/ 01017 01018 template <typename O> // O models OutputIterator 01019 bool xml_parser_t<O>::is_e_tag(token_range_t& name, token_range_t& close_tag) 01020 { 01021 if (!is_token(xml_token_open_slash_tag_k)) return false; 01022 01023 require_token(xml_token_name_k, name); 01024 01025 require_token(xml_token_close_tag_k, close_tag); 01026 01027 return true; 01028 } 01029 01030 /*************************************************************************************************/ 01031 01032 template <typename O> // O models OutputIterator 01033 bool xml_parser_t<O>::is_attribute_set(attribute_set_t& attribute_set) 01034 { 01035 token_range_t att_name; 01036 token_range_t att_value; 01037 01038 while (is_attribute(att_name, att_value)) 01039 attribute_set.insert(att_name, att_value); 01040 01041 return true; 01042 } 01043 01044 /*************************************************************************************************/ 01045 01046 template <typename O> // O models OutputIterator 01047 bool xml_parser_t<O>::is_prolog() 01048 { 01049 token_range_t bom; 01050 token_range_t xml_decl; 01051 01052 if (is_bom(bom)) 01053 { 01054 // REVISIT eberdahl 2006 Jun 18 - sanity check the bom 01055 } 01056 01057 if (is_xml_decl(xml_decl)) 01058 { 01059 // REVISIT eberdahl 2006 Jun 18 - sanity check the encoding 01060 // of the XMLDecl 01061 01062 return true; 01063 } 01064 01065 return false; 01066 } 01067 01068 /*************************************************************************************************/ 01069 01070 template <typename O> // O models OutputIterator 01071 bool xml_parser_t<O>::is_bom(token_range_t& bom) 01072 { 01073 const token_range_t utf8_bom = static_token_range("\xEF\xBB\xBF"); 01074 const token_range_t utf16_be_bom = static_token_range("\xFE\xFF"); 01075 const token_range_t utf16_le_bom = static_token_range("\xFF\xFE"); 01076 01077 bool result = false; 01078 01079 // whitespace skipping should be off when sniffing for a bom 01080 token_stream_m.set_skip_white_space(false); 01081 01082 if (is_token(xml_token_char_data_k, bom)) 01083 { 01084 if (boost::size(utf8_bom) <= boost::size(bom) && 01085 adobe::equal(utf8_bom, bom.first)) 01086 { 01087 bom.second = bom.first; 01088 std::advance(bom.second, boost::size(utf8_bom)); 01089 01090 result = true; 01091 } 01092 else if (boost::size(utf16_be_bom) <= boost::size(bom) && 01093 adobe::equal(utf16_be_bom, bom.first)) 01094 { 01095 // it's a bom, but it's not a format the parser supports 01096 throw_exception("utf16be bom encountered; xml_parser_t only supports utf8 encoding"); 01097 } 01098 else if (boost::size(utf16_le_bom) <= boost::size(bom) && 01099 adobe::equal(utf16_le_bom, bom.first)) 01100 { 01101 // it's a bom, but it's not a format the parser supports 01102 throw_exception("utf16le bom encountered; xml_parser_t only supports utf8 encoding"); 01103 } 01104 } 01105 01106 token_stream_m.set_skip_white_space(true); 01107 01108 return result; 01109 } 01110 01111 /*************************************************************************************************/ 01112 01113 template <typename O> // O models OutputIterator 01114 bool xml_parser_t<O>::is_xml_decl(token_range_t& xml_decl) 01115 { 01116 if (is_token(xml_token_processing_instruction_k, xml_decl)) 01117 { 01118 // REVISIT eberdahl 2006 Jun 18 - sanity check that the PI 01119 // encountered is, in fact, targeted at the xml application 01120 01121 return true; 01122 } 01123 01124 return false; 01125 } 01126 01127 /*************************************************************************************************/ 01128 01129 template <typename O> // O models OutputIterator 01130 bool xml_parser_t<O>::is_attribute(token_range_t& name, token_range_t& value) 01131 { 01132 if (is_token(xml_token_name_k, name)) 01133 { 01134 require_token(xml_token_equals_k); 01135 01136 require_token(xml_token_att_value_k, value); 01137 01138 return true; 01139 } 01140 01141 return false; 01142 } 01143 01144 /*************************************************************************************************/ 01145 01146 template <typename O> // O models OutputIterator 01147 void xml_parser_t<O>::parse_element_sequence() 01148 { 01149 assert(callback_m); 01150 01151 token_range_t dummy; 01152 01153 token_stream_m.set_skip_white_space(false); 01154 01155 while (is_element(dummy)) 01156 is_token(xml_token_char_data_k); 01157 } 01158 01159 /*************************************************************************************************/ 01160 01161 template <typename O> // O models OutputIterator 01162 void xml_parser_t<O>::parse_content() 01163 { 01164 token_range_t content; 01165 01166 token_stream_m.set_skip_white_space(false); 01167 01168 while (true) 01169 { 01170 // always returns true; have to test results 01171 is_content(content); 01172 01173 if (boost::size(content)) 01174 { 01175 token_range_t result(this->callback_m( content, 01176 token_range_t(), 01177 attribute_set_t(), 01178 content)); 01179 01180 adobe::copy(result, this->output_m); 01181 } 01182 else 01183 break; 01184 } 01185 } 01186 01187 /*************************************************************************************************/ 01188 01189 template <typename O> // O models OutputIterator 01190 void xml_parser_t<O>::parse_document() 01191 { 01192 token_range_t dummy; 01193 01194 token_stream_m.set_skip_white_space(true); 01195 01196 is_prolog(); 01197 is_element(dummy); 01198 } 01199 01200 /*************************************************************************************************/ 01201 01221 template <typename O> // O models OutputIterator 01222 inline xml_parser_t<O> make_xml_parser( uchar_ptr_t first, 01223 uchar_ptr_t last, 01224 const line_position_t& position, 01225 typename xml_parser_t<O>::preorder_predicate_t predicate, 01226 typename xml_parser_t<O>::callback_proc_t callback, 01227 O output) 01228 { return xml_parser_t<O>(first, last, position, predicate, callback, output); } 01229 01230 /*************************************************************************************************/ 01236 template <typename Result, typename InputIterator> 01237 InputIterator xatoi(InputIterator first, InputIterator last, Result& result) 01238 { 01239 result = 0; 01240 01241 while (first != last && std::isxdigit(*first)) 01242 { 01243 typename std::iterator_traits<InputIterator>::value_type c(*first); 01244 01245 result <<= 4; 01246 01247 if (std::isdigit(c)) 01248 { 01249 result += c - '0'; 01250 } 01251 else 01252 { 01253 c = std::use_facet<std::ctype<char> >(std::locale()).tolower(c); 01254 01255 result += c - 'a' + 10; 01256 } 01257 01258 ++first; 01259 } 01260 01261 return first; 01262 } 01263 01264 /*************************************************************************************************/ 01270 template <typename Result, typename InputIterator> 01271 InputIterator datoi(InputIterator first, InputIterator last, Result& result) 01272 { 01273 result = 0; 01274 01275 while (first != last && std::isdigit(*first)) 01276 { 01277 result *= 10; 01278 01279 result += *first - '0'; 01280 01281 ++first; 01282 } 01283 01284 return first; 01285 } 01286 01287 /*************************************************************************************************/ 01288 01289 } // namespace adobe 01290 01291 /*************************************************************************************************/ 01292 01293 #endif 01294 01295 /*************************************************************************************************/ |