[C#] - 从 HTML 代码中 转换 / 提取 可读文字(PlainText)的方法
生活随笔
收集整理的這篇文章主要介紹了
[C#] - 从 HTML 代码中 转换 / 提取 可读文字(PlainText)的方法
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
背景
在做網頁數據分析的時候,我們關注的部分是內容,可以過濾掉HTML標簽、Javascript、CSS等代碼。
目標輸入
<b>Hello World.</b><br/><p><i>Is there anyone out there?</i><p>
輸出結果
Hello World. Is there anyone out there?
開發工具
Html Agility Pack
http://html-agility-pack.net/
實現方案1:(過濾規則嚴謹,保留HTML版式,推薦使用!)
// Small but important modification to the class at
// https://github.com/zzzprojects/html-agility-pack/blob/master/src/Samples/Html2Txt/HtmlConvert.cs

/// <summary>
/// Converts HTML (from a file, a string, or an already parsed document) into
/// plain text, emitting line breaks for block-level elements and list items.
/// </summary>
public static class HtmlToText
{
    // Runs of two or more whitespace characters inside a text node are collapsed
    // to a single space; a lone whitespace character is left untouched.
    private static readonly Regex WhitespaceRun = new Regex(@"\s{2,}");

    /// <summary>Loads the HTML file at <paramref name="path"/> and returns its plain text.</summary>
    /// <param name="path">Path handed to <c>HtmlDocument.Load</c>.</param>
    /// <returns>The text produced by <see cref="ConvertDoc"/>.</returns>
    public static string Convert(string path)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.Load(path);
        return ConvertDoc(doc);
    }

    /// <summary>Parses <paramref name="html"/> and returns its plain text.</summary>
    /// <param name="html">HTML markup handed to <c>HtmlDocument.LoadHtml</c>.</param>
    /// <returns>The text produced by <see cref="ConvertDoc"/>.</returns>
    public static string ConvertHtml(string html)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        return ConvertDoc(doc);
    }

    /// <summary>Returns the plain text of an already parsed document.</summary>
    /// <param name="doc">The parsed document; must not be null.</param>
    /// <returns>Everything written while walking <c>doc.DocumentNode</c>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="doc"/> is null.</exception>
    public static string ConvertDoc(HtmlDocument doc)
    {
        if (doc == null)
        {
            throw new ArgumentNullException("doc");
        }

        using (StringWriter sw = new StringWriter())
        {
            ConvertTo(doc.DocumentNode, sw);
            return sw.ToString();
        }
    }

    /// <summary>Converts every child of <paramref name="node"/>, in order, sharing one text state.</summary>
    internal static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        foreach (HtmlNode subnode in node.ChildNodes)
        {
            ConvertTo(subnode, outText, textInfo);
        }
    }

    /// <summary>Writes the plain text of <paramref name="node"/> to <paramref name="outText"/>, starting from a fresh text state.</summary>
    /// <param name="node">Root node to convert; must not be null.</param>
    /// <param name="outText">Destination writer; must not be null.</param>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static void ConvertTo(HtmlNode node, TextWriter outText)
    {
        if (node == null)
        {
            throw new ArgumentNullException("node");
        }

        if (outText == null)
        {
            throw new ArgumentNullException("outText");
        }

        ConvertTo(node, outText, new PreceedingDomTextInfo(false));
    }

    /// <summary>Dispatches on the node type; comments (and any type not listed) produce no output.</summary>
    internal static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        switch (node.NodeType)
        {
            case HtmlNodeType.Comment:
                // don't output comments
                break;

            case HtmlNodeType.Document:
                ConvertContentTo(node, outText, textInfo);
                break;

            case HtmlNodeType.Text:
                ConvertTextNode(node, outText, textInfo);
                break;

            case HtmlNodeType.Element:
                ConvertElementNode(node, outText, textInfo);
                break;
        }
    }

    /// <summary>Writes one text node, trimming and collapsing whitespace according to the running text state.</summary>
    private static void ConvertTextNode(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        // script and style must not be output; a detached text node has no
        // parent, so there is nothing to exclude it by.
        HtmlNode parent = node.ParentNode;
        if (parent != null && (parent.Name == "script" || parent.Name == "style"))
        {
            return;
        }

        string html = ((HtmlTextNode)node).Text;

        // is it in fact a special closing node output as text?
        if (HtmlNode.IsOverlappedClosingElement(html))
        {
            return;
        }

        // check the text is meaningful and not a bunch of whitespaces
        if (html.Length == 0)
        {
            return;
        }

        if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
        {
            html = html.TrimStart();
            if (html.Length == 0)
            {
                return;
            }

            textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
        }

        outText.Write(HtmlEntity.DeEntitize(WhitespaceRun.Replace(html.TrimEnd(), " ")));

        // Trailing whitespace was trimmed above; if the node ended with any,
        // emit a single space and remember it so the next node trims its start.
        textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]);
        if (textInfo.LastCharWasSpace)
        {
            outText.Write(' ');
        }
    }

    /// <summary>Writes the prefix/suffix for one element and recurses into its children unless the element is skipped.</summary>
    private static void ConvertElementNode(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        string endElementString = null;
        bool isInline;
        bool skip = false;
        int listIndex = 0;

        switch (node.Name)
        {
            case "nav":
                skip = true;
                isInline = false;
                break;

            case "body":
            case "section":
            case "article":
            case "aside":
            case "h1":
            case "h2":
            case "header":
            case "footer":
            case "address":
            case "main":
            case "div":
            case "p": // stylistic - adjust as you tend to use
                if (textInfo.IsFirstTextOfDocWritten)
                {
                    outText.Write("\r\n");
                }
                endElementString = "\r\n";
                isInline = false;
                break;

            case "br":
                outText.Write("\r\n");
                skip = true;
                textInfo.WritePrecedingWhiteSpace = false;
                isInline = true;
                break;

            case "a":
                if (node.Attributes.Contains("href"))
                {
                    string href = node.Attributes["href"].Value.Trim();
                    // Append the target only when the link text does not already contain it.
                    if (node.InnerText.IndexOf(href, StringComparison.InvariantCultureIgnoreCase) == -1)
                    {
                        endElementString = "<" + href + ">";
                    }
                }
                isInline = true;
                break;

            case "li":
                if (textInfo.ListIndex > 0)
                {
                    outText.Write("\r\n{0}.\t", textInfo.ListIndex++);
                }
                else
                {
                    // using '*' as bullet char, with tab after, but whatever you want eg "\t->", if utf-8 0x2022
                    outText.Write("\r\n*\t");
                }
                isInline = false;
                break;

            case "ol":
                listIndex = 1;
                goto case "ul";

            case "ul": // not handling nested lists any differently at this stage - that is getting close to rendering problems
                endElementString = "\r\n";
                isInline = false;
                break;

            case "img": // inline-block in reality
                if (node.Attributes.Contains("alt"))
                {
                    outText.Write("[" + node.Attributes["alt"].Value);
                    endElementString = "]";
                }
                if (node.Attributes.Contains("src"))
                {
                    outText.Write("<" + node.Attributes["src"].Value + ">");
                }
                isInline = true;
                break;

            default:
                isInline = true;
                break;
        }

        if (!skip && node.HasChildNodes)
        {
            // Inline elements continue the current text state; the other elements
            // start a new one that shares only the "first text written" flag.
            PreceedingDomTextInfo childInfo = isInline
                ? textInfo
                : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex };
            ConvertContentTo(node, outText, childInfo);
        }

        if (endElementString != null)
        {
            outText.Write(endElementString);
        }
    }
}

/// <summary>Mutable state about the text written so far, passed along while walking the DOM.</summary>
internal class PreceedingDomTextInfo
{
    public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
    {
        IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
    }

    /// <summary>When false, leading whitespace of the next text node is trimmed.</summary>
    public bool WritePrecedingWhiteSpace { get; set; }

    /// <summary>True when the last text node written ended with whitespace.</summary>
    public bool LastCharWasSpace { get; set; }

    /// <summary>Shared flag, set once a text node has had its leading whitespace trimmed and been written.</summary>
    public readonly BoolWrapper IsFirstTextOfDocWritten;

    /// <summary>Next number for an ordered-list item; 0 means items get a bullet.</summary>
    public int ListIndex { get; set; }
}

/// <summary>Reference-type wrapper around a bool so several objects can share and mutate one flag.</summary>
internal class BoolWrapper
{
    public BoolWrapper() { }

    public bool Value { get; set; }

    // A null wrapper converts to false instead of throwing.
    public static implicit operator bool(BoolWrapper boolWrapper)
    {
        return boolWrapper != null && boolWrapper.Value;
    }

    public static implicit operator BoolWrapper(bool boolWrapper)
    {
        return new BoolWrapper { Value = boolWrapper };
    }
}

實現方案2:(過濾規則不嚴謹,適用于結構簡單的HTML)
/// <summary>
/// Removes everything that looks like a tag (a '&lt;', one or more non-'&gt;'
/// characters, then a '&gt;') from <paramref name="HTMLText"/>.
/// </summary>
/// <param name="HTMLText">Markup to strip; null or empty yields an empty string.</param>
/// <param name="decode">When true, the stripped text is passed through <c>HttpUtility.HtmlDecode</c>.</param>
/// <returns>The text with tag-like spans removed, optionally HTML-decoded.</returns>
public static string StripHTML(string HTMLText, bool decode = true)
{
    if (string.IsNullOrEmpty(HTMLText))
    {
        return string.Empty;
    }

    // The pattern contains no letters, so no case-insensitivity option is needed;
    // the static Regex.Replace avoids constructing a Regex object on every call.
    string stripped = Regex.Replace(HTMLText, "<[^>]+>", "");
    return decode ? HttpUtility.HtmlDecode(stripped) : stripped;
}

參考資料
https://stackoverflow.com/a/25178738
https://stackoverflow.com/a/732110
轉載于:https://www.cnblogs.com/jinzesudawei/p/8713497.html
總結
以上是生活随笔為你收集整理的[C#] - 从 HTML 代码中 转换 / 提取 可读文字(PlainText)的方法的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Selenium WebDriver-
- 下一篇: 122 Best Time to Buy