瀏覽器已經在 o.node 中爲您準備好了一棵解析完善的 HTML 樹。
先把文檔內容序列化爲 HTML(使用 innerHTML),再試圖用正則表達式去修改它(正則表達式無法可靠地解析 HTML),
最後通過設置 innerHTML 把結果重新解析成文檔內容……這種做法實在有點本末倒置。
相反,請直接檢查 o.node 內部已有的元素節點和屬性節點,並刪除其中不需要的部分,例如:
// Example call: under o.node keep only <p>, <br> and <a> elements; <p> and <br> keep
// no attributes, <a> keeps only its href attribute.
filterNodes(o.node, {p: [], br: [], a: ['href']});
其中 filterNodes 的定義如下:
// Remove every descendant element and attribute of `element` that is not whitelisted.
//
// `allow` maps a lowercase element name to an array of the lowercase attribute names
// that element may keep, e.g. {p: [], br: [], a: ['href']}.
//
// An element that is not whitelisted is replaced by its own (already filtered) children,
// so its content survives. `element` itself is never checked or removed. The tree is
// modified in place and nothing is returned.
//
function filterNodes(element, allow) {
    // Walk a snapshot of the children, because the loop body inserts into and removes
    // from `element` while we iterate.
    //
    Array.fromList(element.childNodes).forEach(function(child) {
        // Only element nodes (nodeType 1) are filtered; every other node is left alone.
        //
        if (child.nodeType !== 1)
            return;

        // Recurse first, so anything hoisted out of `child` below has already been filtered.
        //
        filterNodes(child, allow);

        var tag = child.tagName.toLowerCase();

        // Use an own-property test rather than `in`: names inherited from Object.prototype
        // (such as `constructor`) must not be mistaken for whitelisted element names.
        //
        if (Object.prototype.hasOwnProperty.call(allow, tag)) {
            // Remove unwanted attributes, again iterating over a snapshot because the
            // attribute list is modified inside the loop.
            //
            var allowedAttributes = allow[tag];
            Array.fromList(child.attributes).forEach(function(attr) {
                if (allowedAttributes.indexOf(attr.name.toLowerCase()) === -1)
                    child.removeAttributeNode(attr);
            });
        } else {
            // Replace the unwanted element with its contents, preserving their order.
            //
            while (child.firstChild)
                element.insertBefore(child.firstChild, child);
            element.removeChild(child);
        }
    });
}
// ECMAScript Fifth Edition (and JavaScript 1.6) array methods used by `filterNodes`.
// Not every browser provides them natively, so install a fallback when one is missing.
//
if (!('indexOf' in Array.prototype)) {
    // Fallback indexOf: return the first index, at or after the optional start index `ix`,
    // whose element is strictly equal (===) to `find`; return -1 when there is none.
    // Holes in sparse arrays are skipped.
    //
    Array.prototype.indexOf = function(find, ix /*opt*/) {
        var n = this.length;

        // Normalise the start index: a missing or NaN value means 0, and fractions are
        // truncated towards zero.
        //
        var start = Number(ix) || 0;
        start = start < 0 ? Math.ceil(start) : Math.floor(start);

        // A negative start counts back from the end of the array, clamped at index 0.
        // Math.max also turns a possible -0 into +0.
        //
        if (start < 0)
            start += n;
        start = Math.max(start, 0);

        for (var i = start; i < n; i++)
            if (i in this && this[i] === find)
                return i;
        return -1;
    };
}
if (!('forEach' in Array.prototype)) {
    // Fallback forEach: call `action(value, index, array)` once for each index that exists
    // in the array, in ascending order, with `that` as the `this` value of the call.
    // The length is read once up front, so indices added during the walk are not visited.
    //
    Array.prototype.forEach = function(action, that /*opt*/) {
        var total = this.length;
        var index = 0;
        while (index < total) {
            // Skip holes in sparse arrays.
            if (index in this) {
                action.call(that, this[index], index, this);
            }
            index += 1;
        }
    };
}
// Helper for `filterNodes`: copy an array-like object (anything with a `length` and
// numeric indices) into a genuine Array. This does the job of `Array.prototype.slice()`,
// which the ECMAScript standard does not promise will work on a host object such as a
// DOM NodeList.
//
Array.fromList = function(list) {
    var count = list.length;
    var copy = new Array(count);
    var index = 0;
    while (index < count) {
        copy[index] = list[index];
        index += 1;
    }
    return copy;
};
http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454 – waiwai933 2010-03-06 16:33:25