如何檢查HTML字符串是否安全？

在我的應用程序中，我需要以字符串形式發送和接收HTML。爲了保證安全，我需要檢查字符串中的DOM元素是否都屬於允許的標籤、樣式聲明是否有效，以及其中是否沒有注入腳本。首先想到的當然是用正則表達式處理字符串，但這很繁瑣，容易出錯，而且肯定效率低下。第二個想法是使用XPath，但即使讀過MDN網站上的一些資料，我仍然不知道如何實現下面這段示例代碼：

// Alias for the XPCOM nsIDOMXPathResult interface; its ANY_TYPE constant is
// passed as the result type to every document.evaluate() call in this file.
const XPathResult   = Components.interfaces.nsIDOMXPathResult; 

// Whitelists: anything not listed here makes a fragment insecure.
const ALLOWED_TAGS   = ['div', 'span', 'b', 'i', 'u', 'br', 'font', 'img'];
const ALLOWED_STYLES  = ['font-weight', 'font-size', 'font-family', 'text-decoration', 'color', 'background-color'];
const ALLOWED_ATTRIBUTES = ['style', 'name'];

/**
 * Builds an XPath predicate body that is true for a node whose name() is
 * none of the given names. Each name is accepted in its given (lower-case)
 * spelling and in its fully upper-cased spelling; mixed-case spellings are
 * not accepted and therefore count as "not whitelisted".
 *
 * @param {String[]} names whitelisted node names
 * @return {String} XPath boolean expression
 */
function xpathNameNotIn(names) {
    return names.map(function (v) {
        return "name() != '" + v + "' and name() != '" + v.toUpperCase() + "'";
    }).join(' and ');
}

const XPATH_PART_TAGS = xpathNameNotIn(ALLOWED_TAGS);
const XPATH_PART_ATTRS = xpathNameNotIn(ALLOWED_ATTRIBUTES);

// All three expressions are relative to the context node handed to
// document.evaluate(). An absolute path ("//...") would ignore the context
// node and scan the whole tree that contains it.

// Elements outside the XHTML namespace or outside the tag whitelist.
const XPATH_BAD_TAGS  = "descendant-or-self::*[(namespace-uri() != 'http://www.w3.org/1999/xhtml') or (" + XPATH_PART_TAGS + ")]";
// Attributes in a foreign namespace or outside the attribute whitelist.
const XPATH_BAD_ATTRIBUTES = "descendant-or-self::*/@*[((namespace-uri() != 'http://www.w3.org/1999/xhtml') and (namespace-uri() != '')) or (" + XPATH_PART_ATTRS + ")]";
// Every style attribute, in both spellings the attribute whitelist lets
// through, so an upper-case STYLE cannot skip the declaration check.
const XPATH_STYLE   = "descendant-or-self::*/@*[name() = 'style' or name() = 'STYLE']";


/**
 * Checks if an inline style definition is considered secure.
 *
 * The value is split into declarations on ';'. Empty declarations are
 * ignored. Every other declaration must have the form "name: value", its
 * name (compared case-insensitively) must be listed in ALLOWED_STYLES, and
 * its value must not contain constructs that can load resources, run
 * script or hide such constructs from this check.
 *
 * @param {String} styleValue value of style attribute
 * @return bool true if every declaration is acceptable, false otherwise
 */
function isStyleSecure(styleValue) {
    // url(...) / expression(...) / javascript: are the dangerous payloads;
    // backslash escapes and comments can be used to disguise them, so they
    // are rejected outright as well.
    var unsafeValue = /url\s*\(|expression\s*\(|javascript\s*:|\\|\/\*/i;

    return styleValue.split(';').every(function (rawDeclaration) {
        var declaration = rawDeclaration.trim(),
            colon,
            name,
            value;

        if (declaration === '') {
            return true;
        }

        // Split on the first colon only, so the whole value is inspected
        // even when it contains further colons.
        colon = declaration.indexOf(':');
        if (colon === -1) {
            return false;
        }
        name = declaration.slice(0, colon).trim().toLowerCase();
        value = declaration.slice(colon + 1).trim();

        return ALLOWED_STYLES.indexOf(name) !== -1 && !unsafeValue.test(value);
    });
}

/**
 * Singleton that verifies if a given XHTML document fragment is considered secure.
 * Uses whitelist-based checks on tag names, attribute names and document
 * namespaces, and validates the value of every style attribute it finds.
 *
 * @class
 * @namespace core.SecurityFilter.MessageSecurityFilter
 */
var MessageSecurityFilter = {
    /**
     * Checks if given document fragment is safe.
     *
     * Runs the XPATH_BAD_TAGS, XPATH_BAD_ATTRIBUTES and XPATH_STYLE queries
     * through the owner document of `element`, with `element` as the
     * context node, and stops at the first violation.
     *
     * @param {nsIDOMElement} element root element of the XHTML document fragment to analyze
     * @return {bool} true if fragment is safe, false otherwise
     */
    isSecure: function SecurityFilter_isSecure(element) {
        var document = element.ownerDocument,
            result,
            attr;

        // A single match is enough: some element is not whitelisted.
        result = document.evaluate(XPATH_BAD_TAGS, element, null, XPathResult.ANY_TYPE, null);
        if (result.iterateNext()) {
            return false;
        }

        // Likewise for attributes that are not whitelisted.
        result = document.evaluate(XPATH_BAD_ATTRIBUTES, element, null, XPathResult.ANY_TYPE, null);
        if (result.iterateNext()) {
            return false;
        }

        // Style attributes are allowed by name, so each value is checked
        // separately.
        result = document.evaluate(XPATH_STYLE, element, null, XPathResult.ANY_TYPE, null);
        while ((attr = result.iterateNext())) {
            if (!isStyleSecure(attr.nodeValue)) {
                return false;
            }
        }

        return true;
    }

};

我最初的想法是創建一個DocumentFragment，然後使用TreeWalker檢查它的節點，或者通過.firstChild等屬性遍歷DOM樹。但是我認爲這個方案並不安全，因爲它會讓所有注入的腳本都有機會執行。我說得對嗎？

有沒有其他辦法？

來源

2011-09-05 mike_hornbeck

請確保你在服務器端做同樣的事情... –

自己編寫HTML淨化器的問題在於：大多數時候人們只嘗試清理有效的HTML，而瀏覽器被設計成也能處理無效的SGML，因此存在很多漏洞——用戶上傳的內容可能看起來是無效的HTML，實際上卻能起作用，而淨化器根本沒有攔截到。最好採用社區中已有的淨化器；如果您覺得自己發現了漏洞，請提交更新——這能幫助所有人解決問題。 – vol7ron