htmlparser

HTML Parser By John Resig

当前为 2014-08-26 提交的版本,查看 最新版本

此脚本不应直接安装。它是供其他脚本使用的外部库,要使用该库请加入元指令 // @require https://update.gf.qytechs.cn/scripts/4535/15016/htmlparser.js

  1. /*
  2. * HTML Parser By John Resig (ejohn.org)
  3. * Original code by Erik Arvidsson, Mozilla Public License
  4. * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
  5. *
  6. * // Use like so:
  7. * HTMLParser(htmlString, {
  8. * start: function(tag, attrs, unary) {},
  9. * end: function(tag) {},
  10. * chars: function(text) {},
  11. * comment: function(text) {}
  12. * });
  13. *
  14. * // or to get an XML string:
  15. * HTMLtoXML(htmlString);
  16. *
  17. * // or to get an XML DOM Document
  18. * HTMLtoDOM(htmlString);
  19. *
  20. * // or to inject into an existing document/DOM node
  21. * HTMLtoDOM(htmlString, document);
  22. * HTMLtoDOM(htmlString, document.body);
  23. *
  24. */
  25.  
  26. (function(){
  27.  
  // Regular expressions used by the tokenizer.

  // Opening tag anchored at the start of the input.
  //   group 1: tag name
  //   group 2: raw attribute text (zero or more `name`, `name=value` pairs)
  //   group 3: "/" when the tag is written self-closing, otherwise ""
  var startTag = /^<([-A-Za-z0-9_]+)((?:\s+[-A-Za-z0-9_]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/;

  // Closing tag anchored at the start of the input; group 1 is the tag name.
  var endTag = /^<\/([-A-Za-z0-9_]+)[^>]*>/;

  // One attribute at a time (global flag, so it can be walked with replace()).
  //   group 1: attribute name
  //   group 2: value written in double quotes
  //   group 3: value written in single quotes
  //   group 4: unquoted value
  var attr = /([-A-Za-z0-9_]+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g;
  // Lookup tables consulted by HTMLParser. Each is built by makeMap() from a
  // comma-separated list and maps a lowercase name to true.

  // Empty Elements - HTML 4.01
  // A start tag for one of these is always reported as unary and is never
  // pushed on the open-element stack.
  var empty = makeMap("area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed");

  // Block Elements - HTML 4.01
  // Opening one of these first closes any inline elements on top of the stack.
  var block = makeMap("address,applet,blockquote,button,center,dd,del,dir,div,dl,dt,fieldset,form,frameset,hr,iframe,ins,isindex,li,map,menu,noframes,noscript,object,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul");

  // Inline Elements - HTML 4.01
  var inline = makeMap("a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");

  // Elements that you can, intentionally, leave open
  // (and which close themselves): a new start tag of the same name closes the
  // one currently on top of the stack.
  // "option" is the real HTML element; the original list only had the typo
  // "options", which is kept so existing input behaves as before.
  var closeSelf = makeMap("colgroup,dd,dt,li,option,options,p,td,tfoot,th,thead,tr");

  // Attributes that have their values filled in disabled="disabled"
  // (used when the attribute is written without a value).
  var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");

  // Special Elements (can contain anything): their content is handed to
  // handler.chars as raw text up to the matching end tag.
  var special = makeMap("script,style");
  50.  
  /**
   * Tokenizes an HTML string and reports what it finds through callbacks.
   *
   * @param {string} html     Markup to parse.
   * @param {Object} handler  Object with optional callbacks:
   *   start(tagName, attrs, unary) - tagName is lowercased; attrs is an array
   *                                  of { name, value, escaped }; unary is a
   *                                  boolean (void element or "/>" syntax).
   *   end(tagName)                 - an open element was closed (explicitly
   *                                  or implicitly).
   *   chars(text)                  - a run of text.
   *   comment(text)                - the text between "<!--" and "-->".
   * @throws {string} "Parse Error: " + remaining input, whenever one pass of
   *   the loop consumes nothing (for example a stray "<" or an unterminated
   *   comment). A string is thrown, not an Error, to stay compatible with
   *   existing callers.
   */
  var HTMLParser = this.HTMLParser = function( html, handler ) {
    var index, chars, match, text, rawText, stack = [], last = html;

    // Name of the innermost open element, or undefined when none is open.
    stack.last = function(){
      return this[ this.length - 1 ];
    };

    while ( html ) {
      // Stays true when the input at this position is not a comment or tag.
      chars = true;

      // Make sure we're not in a script or style element
      if ( !stack.last() || !special[ stack.last() ] ) {

        // Comment
        if ( html.indexOf("<!--") == 0 ) {
          index = html.indexOf("-->");

          if ( index >= 0 ) {
            if ( handler.comment )
              handler.comment( html.substring( 4, index ) );
            html = html.substring( index + 3 );
            chars = false;
          }

        // end tag
        } else if ( html.indexOf("</") == 0 ) {
          match = html.match( endTag );

          if ( match ) {
            html = html.substring( match[0].length );
            parseEndTag( match[0], match[1] );
            chars = false;
          }

        // start tag
        } else if ( html.indexOf("<") == 0 ) {
          match = html.match( startTag );

          if ( match ) {
            html = html.substring( match[0].length );
            parseStartTag( match[0], match[1], match[2], match[3] );
            chars = false;
          }
        }

        if ( chars ) {
          // Everything up to the next "<" is text.
          index = html.indexOf("<");

          text = index < 0 ? html : html.substring( 0, index );
          html = index < 0 ? "" : html.substring( index );

          if ( handler.chars )
            handler.chars( text );
        }

      } else {
        // Inside <script>/<style>: take everything up to the FIRST matching
        // end tag as raw text. [\s\S] is used because "." does not match
        // line terminators, the match is lazy so a later element on the same
        // line is not swallowed, and the "i" flag accepts </SCRIPT> for a
        // stack entry that was lowercased on the way in.
        rawText = new RegExp("^([\\s\\S]*?)<\\/" + stack.last() + "[^>]*>", "i");

        html = html.replace(rawText, function(all, content){
          // Unwrap comment / CDATA guards around the raw content.
          content = content.replace(/<!--([\s\S]*?)-->/g, "$1")
            .replace(/<!\[CDATA\[([\s\S]*?)]]>/g, "$1");

          if ( handler.chars )
            handler.chars( content );

          return "";
        });

        parseEndTag( "", stack.last() );
      }

      // Nothing was consumed during this pass: bail out instead of looping
      // forever.
      if ( html == last )
        throw "Parse Error: " + html;
      last = html;
    }

    // Clean up any remaining tags
    parseEndTag();

    /**
     * Handles one start tag: performs implicit closes, updates the stack and
     * reports the tag to handler.start.
     *
     * @param {string} tag      Full text of the tag (unused).
     * @param {string} tagName  Tag name as written in the source.
     * @param {string} rest     Raw attribute text.
     * @param {string} unary    "/" when written self-closing, otherwise "".
     */
    function parseStartTag( tag, tagName, rest, unary ) {
      tagName = tagName.toLowerCase();

      // A block element closes every inline element on top of the stack.
      if ( block[ tagName ] ) {
        while ( stack.last() && inline[ stack.last() ] ) {
          parseEndTag( "", stack.last() );
        }
      }

      // <li>, <p>, ... close a directly preceding open element of the same name.
      if ( closeSelf[ tagName ] && stack.last() == tagName ) {
        parseEndTag( "", tagName );
      }

      unary = !!( empty[ tagName ] || unary );

      if ( !unary )
        stack.push( tagName );

      if ( handler.start ) {
        var attrs = [];

        rest.replace(attr, function(match, name, doubleQuoted, singleQuoted, unquoted) {
          // First non-empty capture wins; a value-less (or empty-valued)
          // boolean attribute is filled in with its own name.
          var value = doubleQuoted ? doubleQuoted :
            singleQuoted ? singleQuoted :
            unquoted ? unquoted :
            fillAttrs[name] ? name : "";

          attrs.push({
            name: name,
            value: value,
            escaped: value.replace(/(^|[^\\])"/g, '$1\\\"') //"
          });
        });

        handler.start( tagName, attrs, unary );
      }
    }

    /**
     * Closes open elements and reports each one to handler.end.
     *
     * With a tag name: closes the nearest open element of that name and
     * everything opened after it; does nothing if no such element is open.
     * Without a tag name: closes everything that is still open.
     *
     * @param {string} [tag]      Full text of the end tag (unused).
     * @param {string} [tagName]  Name of the element to close.
     */
    function parseEndTag( tag, tagName ) {
      var pos, i;

      // If no tag name is provided, clean shop
      if ( !tagName ) {
        pos = 0;

      // Find the closest opened tag of the same type
      } else {
        // Stack entries are lowercase, so </DIV> must be normalized to match.
        tagName = tagName.toLowerCase();

        for ( pos = stack.length - 1; pos >= 0; pos-- )
          if ( stack[ pos ] == tagName )
            break;
      }

      if ( pos >= 0 ) {
        // Close all the open elements, up the stack
        for ( i = stack.length - 1; i >= pos; i-- )
          if ( handler.end )
            handler.end( stack[ i ] );

        // Remove the open elements from the stack
        stack.length = pos;
      }
    }
  };
  /**
   * Re-serializes an HTML string as markup in which every element reported
   * by HTMLParser is explicitly closed: unary tags are written as <tag/> and
   * every attribute is written as name="escaped value".
   *
   * @param {string} html  Markup to convert.
   * @returns {string} The re-serialized markup.
   */
  this.HTMLtoXML = function( html ) {
    // Output fragments, joined once parsing has finished.
    var pieces = [];

    HTMLParser(html, {
      start: function( tag, attrs, unary ) {
        var opening = "<" + tag;

        for ( var k = 0; k < attrs.length; k++ ) {
          var attribute = attrs[k];
          opening += " " + attribute.name + '="' + attribute.escaped + '"';
        }

        pieces.push( opening + (unary ? "/" : "") + ">" );
      },
      end: function( tag ) {
        pieces.push( "</" + tag + ">" );
      },
      chars: function( text ) {
        // Text is passed through untouched.
        pieces.push( "" + text );
      },
      comment: function( text ) {
        pieces.push( "<!--" + text + "-->" );
      }
    });

    return pieces.join("");
  };
  /**
   * Parses an HTML string and builds DOM nodes from it.
   *
   * @param {string} html   Markup to parse.
   * @param {Object} [doc]  A document, or a node whose owner document should
   *                        be used. When omitted a new document is created.
   * @returns {Object} The document the nodes were created in. Parsed content
   *   is appended to that document's <body> (link/base go to <head>).
   * @throws {TypeError} When no document is given and none can be created.
   */
  this.HTMLtoDOM = function( html, doc ) {
    var hasOwn = Object.prototype.hasOwnProperty;

    // There can be only one of these elements
    var one = makeMap("html,head,body,title");

    // Enforce a structure for the document
    var structure = {
      link: "head",
      base: "head"
    };

    if ( !doc ) {
      if ( typeof DOMDocument != "undefined" )
        doc = new DOMDocument();
      else if ( typeof document != "undefined" && document.implementation && document.implementation.createDocument )
        doc = document.implementation.createDocument("", "", null);
      // Test the constructor that is actually used below (the original
      // checked an unrelated global named "ActiveX").
      else if ( typeof ActiveXObject != "undefined" )
        doc = new ActiveXObject("Msxml.DOMDocument");
    } else {
      doc = doc.ownerDocument ||
        doc.getOwnerDocument && doc.getOwnerDocument() ||
        doc;
    }

    if ( !doc )
      throw new TypeError("HTMLtoDOM: no document was supplied and none could be created");

    // Stack of currently open DOM elements.
    var elems = [],
      documentElement = doc.documentElement ||
        doc.getDocumentElement && doc.getDocumentElement();

    // If we're dealing with an empty document then we
    // need to pre-populate it with the HTML document structure
    if ( !documentElement && doc.createElement ) (function(){
      var htmlElem = doc.createElement("html");
      var headElem = doc.createElement("head");
      headElem.appendChild( doc.createElement("title") );
      htmlElem.appendChild( headElem );
      htmlElem.appendChild( doc.createElement("body") );
      doc.appendChild( htmlElem );
    })();

    // Find all the unique elements. Until this runs each entry of `one` is
    // the boolean true; afterwards it is the existing node or undefined.
    if ( doc.getElementsByTagName )
      for ( var name in one )
        if ( hasOwn.call( one, name ) )
          one[ name ] = doc.getElementsByTagName( name )[0];

    // If we're working with a document, inject contents into
    // the body element
    var bodyNode = one.body;
    var curParentNode = bodyNode;

    HTMLParser( html, {
      start: function( tagName, attrs, unary ) {
        // If it's a pre-built element, then we can ignore
        // its construction
        if ( hasOwn.call( one, tagName ) && one[ tagName ] ) {
          curParentNode = one[ tagName ];
          if ( !unary ) {
            elems.push( curParentNode );
          }
          return;
        }

        var elem = doc.createElement( tagName );

        // attrs is an array: walk it by index rather than with for...in.
        for ( var i = 0; i < attrs.length; i++ )
          elem.setAttribute( attrs[ i ].name, attrs[ i ].value );

        // link/base are forced into <head> when a real head node exists.
        var forcedParent = hasOwn.call( structure, tagName ) ? one[ structure[ tagName ] ] : null;

        if ( forcedParent && forcedParent.appendChild )
          forcedParent.appendChild( elem );
        else if ( curParentNode && curParentNode.appendChild )
          curParentNode.appendChild( elem );

        if ( !unary ) {
          elems.push( elem );
          curParentNode = elem;
        }
      },
      end: function( tag ) {
        elems.pop();

        // Init the new parentNode. Once every element is closed, fall back
        // to the body so later top-level content still has a parent instead
        // of leaving curParentNode undefined.
        curParentNode = elems.length ? elems[ elems.length - 1 ] : bodyNode;
      },
      chars: function( text ) {
        if ( curParentNode && curParentNode.appendChild )
          curParentNode.appendChild( doc.createTextNode( text ) );
      },
      comment: function( text ) {
        // Comments are not added to the DOM.
      }
    });

    return doc;
  };
  270.  
  /**
   * Builds a lookup table from a comma-separated list.
   *
   * The table has no prototype, so names such as "constructor" or
   * "__proto__" are only found when they were actually listed; with a plain
   * {} they would be inherited and look like members of every map.
   *
   * @param {string} str  Comma-separated keys, e.g. "br,hr,img".
   * @returns {Object} Object mapping each listed key to true.
   */
  function makeMap(str){
    var obj = Object.create(null), items = str.split(",");
    for ( var i = 0; i < items.length; i++ )
      obj[ items[i] ] = true;
    return obj;
  }
  277. })();

QingJ © 2025

镜像随时可能失效,请加Q群300939539或关注我们的公众号极客氢云获取最新地址