HTML Agility Pack：剥离不在白名单中的标签_随笔

HTML Agility Pack：剥离不在白名单中的标签

呵呵，显然我几乎已经在某人发表的博客文章中找到了答案。

using System.Collections.Generic;
using System.Linq;
using HtmlAgilityPack;

namespace Wayloop.Blog.Core.Markup
{
    /// <summary>
    /// Whitelist-based HTML sanitizer built on HTML Agility Pack.
    /// Elements whose tag name is not in the whitelist are removed together
    /// with everything inside them; on whitelisted elements, every attribute
    /// that is not explicitly allowed is dropped.
    /// </summary>
    public static class HtmlSanitizer
    {
        // Maps an allowed tag name to the attribute names allowed on it.
        // A null value means "tag allowed, no attributes allowed".
        private static readonly IDictionary<string, string[]> Whitelist;

        static HtmlSanitizer()
        {
            Whitelist = new Dictionary<string, string[]>
            {
                { "a", new[] { "href" } },
                { "strong", null },
                { "em", null },
                { "blockquote", null },
            };
        }

        /// <summary>
        /// Parses <paramref name="input"/> as HTML, removes non-whitelisted
        /// elements and attributes, and returns the trimmed markup.
        /// </summary>
        /// <param name="input">The HTML fragment to sanitize.</param>
        /// <returns>The sanitized markup with surrounding whitespace trimmed.</returns>
        public static string Sanitize(string input)
        {
            var htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(input);
            SanitizeNode(htmlDocument.DocumentNode);
            return htmlDocument.DocumentNode.WriteTo().Trim();
        }

        /// <summary>Sanitizes every direct child of <paramref name="parentNode"/>.</summary>
        private static void SanitizeChildren(HtmlNode parentNode)
        {
            // Walk backwards: SanitizeNode may remove the child at index i,
            // which would shift the indices of any later siblings.
            for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
            {
                SanitizeNode(parentNode.ChildNodes[i]);
            }
        }

        /// <summary>
        /// Removes <paramref name="node"/> if it is a non-whitelisted element;
        /// otherwise strips its disallowed attributes and recurses into its children.
        /// Non-element nodes (text, comments, the document root) are kept as they are.
        /// </summary>
        private static void SanitizeNode(HtmlNode node)
        {
            if (node.NodeType == HtmlNodeType.Element)
            {
                string[] allowedAttributes;
                if (!Whitelist.TryGetValue(node.Name, out allowedAttributes))
                {
                    // Removes the element and its whole subtree.
                    node.ParentNode.RemoveChild(node);
                    return;
                }

                if (node.HasAttributes)
                {
                    for (int i = node.Attributes.Count - 1; i >= 0; i--)
                    {
                        HtmlAttribute currentAttribute = node.Attributes[i];

                        // A null entry means no attribute is allowed on this tag.
                        // Without the null check, Contains() would throw
                        // NullReferenceException for e.g. <strong class="x">.
                        if (allowedAttributes == null || !allowedAttributes.Contains(currentAttribute.Name))
                        {
                            node.Attributes.Remove(currentAttribute);
                        }
                    }
                }
            }

            if (node.HasChildNodes)
            {
                SanitizeChildren(node);
            }
        }
    }
}

我从这里得到了 HtmlSanitizer。显然，它并不是只剥离标签，而是会把不在白名单中的元素连同其内容一起删除。

好的，这是以后需要它的人的解决方案。

/// <summary>
/// Whitelist-based HTML sanitizer built on HTML Agility Pack.
/// Tags that are not in the whitelist are stripped while their content is kept;
/// on whitelisted tags, every attribute that is not explicitly allowed is dropped.
/// </summary>
public static class HtmlSanitizer
{
    // Name given to every element that must be stripped. Disallowed elements are
    // renamed to this marker instead of being selected by their own name, because
    // names such as XML namespace/declaration nodes break XPath parsing.
    // Kept all-lowercase so the XPath matches regardless of how the parser
    // normalizes element-name case.
    private const string RemovableNodeName = "removeablenode";

    // Maps an allowed tag name to the attribute names allowed on it.
    // A null value means "tag allowed, no attributes allowed".
    private static readonly IDictionary<string, string[]> Whitelist;

    static HtmlSanitizer()
    {
        Whitelist = new Dictionary<string, string[]>
        {
            { "a", new[] { "href" } },
            { "strong", null },
            { "em", null },
            { "blockquote", null },
            { "b", null },
            { "p", null },
            { "ul", null },
            { "ol", null },
            { "li", null },
            { "div", new[] { "align" } },
            { "strike", null },
            { "u", null },
            { "sub", null },
            { "sup", null },
            { "table", null },
            { "tr", null },
            { "td", null },
            { "th", null }
        };
    }

    /// <summary>
    /// Sanitizes an HTML fragment: strips non-whitelisted tags (keeping their
    /// content) and removes non-whitelisted attributes.
    /// </summary>
    /// <param name="input">The HTML fragment to sanitize. May be null.</param>
    /// <returns>
    /// The sanitized markup, or an empty string when <paramref name="input"/>
    /// is null, empty, or whitespace only.
    /// </returns>
    public static string Sanitize(string input)
    {
        // Null is treated like empty input instead of throwing on Trim().
        if (input == null || input.Trim().Length < 1)
        {
            return string.Empty;
        }

        var htmlDocument = new HtmlDocument();
        htmlDocument.LoadHtml(input);

        // Pass 1: drop disallowed attributes and rename disallowed elements
        // to the marker name.
        SanitizeNode(htmlDocument.DocumentNode);

        // Pass 2: re-parse the serialized result and unwrap the marker elements.
        return StripHtml(htmlDocument.DocumentNode.WriteTo().Trim());
    }

    /// <summary>Sanitizes every direct child of <paramref name="parentNode"/>.</summary>
    private static void SanitizeChildren(HtmlNode parentNode)
    {
        for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
        {
            SanitizeNode(parentNode.ChildNodes[i]);
        }
    }

    /// <summary>
    /// Marks <paramref name="node"/> for stripping if it is a non-whitelisted
    /// element, otherwise removes its disallowed attributes; in both cases the
    /// children are sanitized too. Non-element nodes are left untouched.
    /// </summary>
    private static void SanitizeNode(HtmlNode node)
    {
        if (node.NodeType == HtmlNodeType.Element)
        {
            string[] allowedAttributes;
            if (!Whitelist.TryGetValue(node.Name, out allowedAttributes))
            {
                // Only mark the element here; StripHtml unwraps it later so
                // that its (sanitized) content survives.
                node.Name = RemovableNodeName;
            }
            else if (node.HasAttributes)
            {
                // NOTE(review): only attribute *names* are checked. Values such
                // as href="javascript:..." pass through unchanged - validate
                // them separately if the input is untrusted.
                for (int i = node.Attributes.Count - 1; i >= 0; i--)
                {
                    HtmlAttribute currentAttribute = node.Attributes[i];

                    // A null entry means no attribute is allowed on this tag.
                    if (allowedAttributes == null || !allowedAttributes.Contains(currentAttribute.Name))
                    {
                        node.Attributes.Remove(currentAttribute);
                    }
                }
            }
        }

        if (node.HasChildNodes)
        {
            SanitizeChildren(node);
        }
    }

    /// <summary>
    /// Re-parses <paramref name="html"/> and removes every marker element
    /// while keeping its children in place.
    /// </summary>
    /// <param name="html">Markup produced by the first sanitizing pass.</param>
    /// <returns>The markup with all marker elements unwrapped.</returns>
    private static string StripHtml(string html)
    {
        HtmlDocument htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(html);

        // SelectNodes yields null (not an empty collection) when nothing
        // matches, so guard before iterating.
        HtmlNodeCollection invalidNodes = htmlDoc.DocumentNode.SelectNodes("//" + RemovableNodeName);
        if (invalidNodes != null)
        {
            foreach (HtmlNode node in invalidNodes)
            {
                // keepGrandChildren = true: drop the tag, keep its content.
                node.ParentNode.RemoveChild(node, true);
            }
        }

        return htmlDoc.DocumentNode.WriteContentTo();
    }
}

我把不在白名单中的节点统一重命名了，因为如果直接用节点原名构造 XPath，一旦遇到带 XML 命名空间的节点，XPath 解析就会崩溃。

欢迎分享，转载请注明来源：内存溢出

原文地址: http://outofmemory.cn/zaji/5440885.html

HTML Agility Pack：剥离不在白名单中的标签

发表评论

评论列表（0条）