呵呵,显然我ALMOST在某人发的博客文章中找到了答案。
using System.Collections.Generic;using System.Linq;using HtmlAgilityPack;namespace Wayloop.Blog.Core.Markup{ public static class HtmlSanitizer { private static readonly IDictionary<string, string[]> Whitelist; static HtmlSanitizer() { Whitelist = new Dictionary<string, string[]> { { "a", new[] { "href" } }, { "strong", null }, { "em", null }, { "blockquote", null }, }; } public static string Sanitize(string input) { var htmldocument = new Htmldocument(); htmldocument.LoadHtml(input); SanitizeNode(htmldocument.documentNode); return htmldocument.documentNode.WriteTo().Trim(); } private static void SanitizeChildren(HtmlNode parentNode) { for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--) { SanitizeNode(parentNode.ChildNodes[i]); } } private static void SanitizeNode(HtmlNode node) { if (node.NodeType == HtmlNodeType.Element) { if (!Whitelist.ContainsKey(node.Name)) { node.ParentNode.RemoveChild(node); return; } if (node.HasAttributes) { for (int i = node.Attributes.Count - 1; i >= 0; i--) { HtmlAttribute currentAttribute = node.Attributes[i]; string[] allowedAttributes = Whitelist[node.Name]; if (!allowedAttributes.Contains(currentAttribute.Name)) { node.Attributes.Remove(currentAttribute); } } } } if (node.HasChildNodes) { SanitizeChildren(node); } } }}
我从这里得到了HtmlSanitizer 显然,它不会剥离标签,但会一起删除元素。
好的,这是以后需要它的人的解决方案。
public static class HtmlSanitizer { private static readonly IDictionary<string, string[]> Whitelist; private static List<string> DeletableNodesXpath = new List<string>(); static HtmlSanitizer() { Whitelist = new Dictionary<string, string[]> { { "a", new[] { "href" } }, { "strong", null }, { "em", null }, { "blockquote", null }, { "b", null}, { "p", null}, { "ul", null}, { "ol", null}, { "li", null}, { "div", new[] { "align" } }, { "strike", null}, { "u", null}, { "sub", null}, { "sup", null}, { "table", null }, { "tr", null }, { "td", null }, { "th", null } }; } public static string Sanitize(string input) { if (input.Trim().Length < 1) return string.Empty; var htmldocument = new Htmldocument(); htmldocument.LoadHtml(input); SanitizeNode(htmldocument.documentNode); string xPath = HtmlSanitizer.CreateXPath(); return StripHtml(htmldocument.documentNode.WriteTo().Trim(), xPath); } private static void SanitizeChildren(HtmlNode parentNode) { for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--) { SanitizeNode(parentNode.ChildNodes[i]); } } private static void SanitizeNode(HtmlNode node) { if (node.NodeType == HtmlNodeType.Element) { if (!Whitelist.ContainsKey(node.Name)) { if (!DeletableNodesXpath.Contains(node.Name)) { //DeletableNodesXpath.Add(node.Name.Replace("?","")); node.Name = "removeableNode"; DeletableNodesXpath.Add(node.Name); } if (node.HasChildNodes) { SanitizeChildren(node); } return; } if (node.HasAttributes) { for (int i = node.Attributes.Count - 1; i >= 0; i--) { HtmlAttribute currentAttribute = node.Attributes[i]; string[] allowedAttributes = Whitelist[node.Name]; if (allowedAttributes != null) { if (!allowedAttributes.Contains(currentAttribute.Name)) { node.Attributes.Remove(currentAttribute); } } else { node.Attributes.Remove(currentAttribute); } } } } if (node.HasChildNodes) { SanitizeChildren(node); } } private static string StripHtml(string html, string xPath) { Htmldocument htmlDoc = new Htmldocument(); htmlDoc.LoadHtml(html); if (xPath.Length > 0) { HtmlNodeCollection invalidNodes = htmlDoc.documentNode.SelectNodes(@xPath); foreach (HtmlNode node in invalidNodes) { node.ParentNode.RemoveChild(node, true); } } return htmlDoc.documentNode.WriteContentTo(); ; } private static string CreateXPath() { string _xPath = string.Empty; for (int i = 0; i < DeletableNodesXpath.Count; i++) { if (i != DeletableNodesXpath.Count - 1) { _xPath += string.Format("//{0}|", DeletableNodesXpath[i].ToString()); } else _xPath += string.Format("//{0}", DeletableNodesXpath[i].ToString()); } return _xPath; } }
我重命名了该节点,因为如果必须解析XML名称空间节点,它将在xpath解析时崩溃。
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)