當前位置：首頁 > 编程语言 > C# >内容正文

C#

[C#] - 从 HTML 代码中转换 / 提取可读文字（PlainText）的方法

發布時間：2025/3/15 C# 28 豆豆

生活随笔收集整理的這篇文章主要介紹了 [C#] - 从 HTML 代码中转换 / 提取可读文字（PlainText）的方法，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

背景

在做網頁數據分析的時候，我們關注的部分是內容，可以過濾掉 HTML 標籤、JavaScript、CSS 等代碼。

目標輸入

Hello World. Is there anyone out there?

輸出結果

Hello World. Is there anyone out there?

開發工具

Html Agility Pack
http://html-agility-pack.net/

實現方案1：（過濾規則嚴謹，保留HTML版式，推薦使用！）

// Small but important modification to the class at
// https://github.com/zzzprojects/html-agility-pack/blob/master/src/Samples/Html2Txt/HtmlConvert.cs

/// <summary>
/// Converts an HTML document (parsed with Html Agility Pack) into plain text,
/// inserting line breaks around block-level elements and list items.
/// </summary>
public static class HtmlToText
{
    // Matches runs of two or more whitespace characters; cached so it is not
    // re-resolved for every text node.
    private static readonly Regex WhitespaceRun = new Regex(@"\s{2,}");

    /// <summary>Loads the HTML file at <paramref name="path"/> and converts it to plain text.</summary>
    /// <param name="path">Path handed to <c>HtmlDocument.Load</c>.</param>
    /// <returns>The text written by <see cref="ConvertDoc"/>.</returns>
    public static string Convert(string path)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.Load(path);
        return ConvertDoc(doc);
    }

    /// <summary>Parses the HTML markup in <paramref name="html"/> and converts it to plain text.</summary>
    /// <param name="html">HTML source handed to <c>HtmlDocument.LoadHtml</c>.</param>
    /// <returns>The text written by <see cref="ConvertDoc"/>.</returns>
    public static string ConvertHtml(string html)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        return ConvertDoc(doc);
    }

    /// <summary>Converts an already-loaded document to plain text.</summary>
    /// <param name="doc">Document whose <c>DocumentNode</c> is walked.</param>
    /// <returns>Everything written to an in-memory <see cref="StringWriter"/> during the walk.</returns>
    public static string ConvertDoc(HtmlDocument doc)
    {
        using (StringWriter sw = new StringWriter())
        {
            ConvertTo(doc.DocumentNode, sw);
            sw.Flush();
            return sw.ToString();
        }
    }

    /// <summary>Converts every child of <paramref name="node"/> in order, sharing one <paramref name="textInfo"/>.</summary>
    internal static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        foreach (HtmlNode subnode in node.ChildNodes)
        {
            ConvertTo(subnode, outText, textInfo);
        }
    }

    /// <summary>
    /// Writes the plain-text form of <paramref name="node"/> to <paramref name="outText"/>,
    /// starting with a fresh state in which no text has been written yet.
    /// </summary>
    public static void ConvertTo(HtmlNode node, TextWriter outText)
    {
        // 'false' is converted to a new BoolWrapper by the implicit operator.
        ConvertTo(node, outText, new PreceedingDomTextInfo(false));
    }

    /// <summary>
    /// Writes the plain-text form of <paramref name="node"/> to <paramref name="outText"/>.
    /// </summary>
    /// <param name="node">Node to convert; dispatch is on its <c>NodeType</c>.</param>
    /// <param name="outText">Destination writer.</param>
    /// <param name="textInfo">
    /// Mutable whitespace/list state carried from the text that precedes this node.
    /// </param>
    internal static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        string html;
        switch (node.NodeType)
        {
            case HtmlNodeType.Comment:
                // don't output comments
                break;

            case HtmlNodeType.Document:
                ConvertContentTo(node, outText, textInfo);
                break;

            case HtmlNodeType.Text:
                // script and style must not be output.
                // Guard against a parentless text node passed to the public ConvertTo.
                string parentName = node.ParentNode != null ? node.ParentNode.Name : null;
                if ((parentName == "script") || (parentName == "style"))
                {
                    break;
                }

                // get text
                html = ((HtmlTextNode)node).Text;

                // is it in fact a special closing node output as text?
                if (HtmlNode.IsOverlappedClosingElement(html))
                {
                    break;
                }

                // check the text is meaningful and not a bunch of whitespaces
                if (html.Length == 0)
                {
                    break;
                }

                // At the start of a block, or directly after a space we already
                // wrote, leading whitespace is dropped.
                if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
                {
                    html = html.TrimStart();
                    if (html.Length == 0)
                    {
                        break;
                    }
                    textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
                }

                // Collapse whitespace runs to one space, then decode entities.
                outText.Write(HtmlEntity.DeEntitize(WhitespaceRun.Replace(html.TrimEnd(), " ")));

                // Trailing whitespace was trimmed above; re-emit it as exactly one
                // space and remember that so the next text node trims its start.
                if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
                {
                    outText.Write(' ');
                }
                break;

            case HtmlNodeType.Element:
                string endElementString = null; // written after the element's children, if set
                bool isInline;                  // inline elements share the parent's textInfo
                bool skip = false;              // true => children are not converted
                int listIndex = 0;              // 1 for <ol> (numbered items), 0 otherwise
                switch (node.Name)
                {
                    case "nav":
                        skip = true;
                        isInline = false;
                        break;

                    case "body":
                    case "section":
                    case "article":
                    case "aside":
                    case "h1":
                    case "h2":
                    case "header":
                    case "footer":
                    case "address":
                    case "main":
                    case "div":
                    case "p": // stylistic - adjust as you tend to use
                        if (textInfo.IsFirstTextOfDocWritten)
                        {
                            outText.Write("\r\n");
                        }
                        endElementString = "\r\n";
                        isInline = false;
                        break;

                    case "br":
                        outText.Write("\r\n");
                        skip = true;
                        textInfo.WritePrecedingWhiteSpace = false;
                        isInline = true;
                        break;

                    case "a":
                        if (node.Attributes.Contains("href"))
                        {
                            string href = node.Attributes["href"].Value.Trim();
                            // Append the target only when the link text does not already contain it.
                            if (node.InnerText.IndexOf(href, StringComparison.InvariantCultureIgnoreCase) == -1)
                            {
                                endElementString = "<" + href + ">";
                            }
                        }
                        isInline = true;
                        break;

                    case "li":
                        if (textInfo.ListIndex > 0)
                        {
                            // Numbered item; the counter lives on the list's textInfo.
                            outText.Write("\r\n{0}.\t", textInfo.ListIndex++);
                        }
                        else
                        {
                            // using '*' as bullet char, with tab after, but whatever you
                            // want eg "\t->", if utf-8 0x2022
                            outText.Write("\r\n*\t");
                        }
                        isInline = false;
                        break;

                    case "ol":
                        listIndex = 1;
                        goto case "ul";

                    case "ul":
                        // not handling nested lists any differently at this stage -
                        // that is getting close to rendering problems
                        endElementString = "\r\n";
                        isInline = false;
                        break;

                    case "img": // inline-block in reality
                        if (node.Attributes.Contains("alt"))
                        {
                            outText.Write('[' + node.Attributes["alt"].Value);
                            endElementString = "]";
                        }
                        if (node.Attributes.Contains("src"))
                        {
                            outText.Write('<' + node.Attributes["src"].Value + '>');
                        }
                        isInline = true;
                        break;

                    default:
                        isInline = true;
                        break;
                }

                if (!skip && node.HasChildNodes)
                {
                    // Block-level elements start a fresh whitespace state but keep
                    // sharing the document-wide "first text written" flag.
                    ConvertContentTo(
                        node,
                        outText,
                        isInline
                            ? textInfo
                            : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex });
                }

                if (endElementString != null)
                {
                    outText.Write(endElementString);
                }
                break;
        }
    }
}

/// <summary>
/// Mutable state describing the text emitted before the node currently being converted.
/// </summary>
internal class PreceedingDomTextInfo
{
    /// <param name="isFirstTextOfDocWritten">
    /// Shared flag object; the same instance is handed on to child contexts.
    /// </param>
    public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
    {
        IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
    }

    /// <summary>When false, the next text node has its leading whitespace trimmed.</summary>
    public bool WritePrecedingWhiteSpace { get; set; }

    /// <summary>True when the last text node written ended in whitespace (emitted as one space).</summary>
    public bool LastCharWasSpace { get; set; }

    /// <summary>Set to true once a text node has been written through a context sharing this wrapper.</summary>
    public readonly BoolWrapper IsFirstTextOfDocWritten;

    /// <summary>Next number for an ordered-list item; 0 makes list items render with a '*' bullet.</summary>
    public int ListIndex { get; set; }
}

/// <summary>
/// Reference-type holder for a <see cref="bool"/> so that several contexts can share one flag.
/// </summary>
internal class BoolWrapper
{
    public BoolWrapper() { }

    public bool Value { get; set; }

    /// <summary>Unwraps to the contained value.</summary>
    public static implicit operator bool(BoolWrapper boolWrapper)
    {
        return boolWrapper.Value;
    }

    /// <summary>Wraps a value in a new, independent instance.</summary>
    public static implicit operator BoolWrapper(bool boolWrapper)
    {
        return new BoolWrapper { Value = boolWrapper };
    }
}

實現方案2：（過濾規則不嚴謹，適用于結構簡單的HTML）

/// <summary>
/// Removes everything that looks like a tag (<c>&lt;...&gt;</c>) from
/// <paramref name="HTMLText"/> and optionally HTML-decodes what is left.
/// </summary>
/// <param name="HTMLText">Markup to strip.</param>
/// <param name="decode">
/// When true (the default) the stripped text is passed through
/// <c>HttpUtility.HtmlDecode</c>; when false it is returned as-is.
/// </param>
/// <returns>The input with all <c>&lt;[^&gt;]+&gt;</c> matches removed.</returns>
public static string StripHTML(string HTMLText, bool decode = true)
{
    string withoutTags = Regex.Replace(HTMLText, "<[^>]+>", "", RegexOptions.IgnoreCase);

    if (!decode)
    {
        return withoutTags;
    }

    return HttpUtility.HtmlDecode(withoutTags);
}

參考資料

https://stackoverflow.com/a/25178738
https://stackoverflow.com/a/732110

轉載于:https://www.cnblogs.com/jinzesudawei/p/8713497.html

總結

以上是生活随笔為你收集整理的[C#] - 从 HTML 代码中转换 / 提取可读文字（PlainText）的方法的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： Selenium WebDriver-
下一篇： 122 Best Time to Buy

3atv精品不卡视频,97人人超碰国产精品最新,中文字幕av一区二区三区人妻少妇,久久久精品波多野结衣,日韩一区二区三区精品

C#

[C#] - 从 HTML 代码中 转换 / 提取 可读文字（PlainText）的方法

總結

[C#] - 从 HTML 代码中转换 / 提取可读文字（PlainText）的方法