Hier ein Code-Schnipsel, um HTML-Code so in reinen Text umzuwandeln, dass ein halbwegs menschenlesbarer Text herauskommt:
/// <summary>
/// Converts HTML markup into reasonably human-readable plain text.
/// </summary>
/// <remarks>
/// The conversion is regex based (no HTML parser is involved) and works in these steps:
/// source whitespace is collapsed, the <c>head</c>, <c>script</c> and <c>style</c> elements
/// are removed together with their content, structural tags are mapped to tabs and line
/// breaks, all remaining tags are stripped, a small set of character entities is translated
/// to ASCII and, finally, runs of line breaks and tabs are limited to two and four.
/// </remarks>
/// <param name="html">The HTML markup to convert. May be <c>null</c> or empty.</param>
/// <returns>
/// The plain text with <see cref="Environment.NewLine"/> as line separator and without
/// leading or trailing whitespace; an empty string if <paramref name="html"/> is
/// <c>null</c> or empty.
/// </returns>
public static string ConvertHtmlToPlainText(string html)
{
    // Based on:
    // http://pastebin.com/NswerNkQ
    // http://stackoverflow.com/questions/8419517/convert-html-to-plain-text-while-preserving-p-br-ul-ol
    // http://www.codeproject.com/KB/HTML/HTML_to_Plain_Text.aspx

    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // ReSharper disable LocalizableElement

    // Line breaks in the markup carry no meaning; browsers render them as a space.
    var result = html.Replace("\r", " ").Replace("\n", " ");

    // Tabs in the markup are only source indentation.
    result = result.Replace("\t", string.Empty);

    // Browsers collapse repeated whitespace into a single space.
    result = Regex.Replace(result, @"\s+", " ");

    // Remove head, script and style elements including their content.
    // The quantifier is lazy so that visible text BETWEEN two such elements survives,
    // and \b keeps e.g. <header> from being treated as <head>.
    foreach (var tag in new[] { "head", "script", "style" })
    {
        result = Regex.Replace(
            result,
            @"<\s*" + tag + @"\b[^>]*>.*?<\s*/\s*" + tag + @"\s*>",
            string.Empty,
            RegexOptions.IgnoreCase);
    }

    // Map structural opening tags to whitespace. "\r" is the internal line break
    // marker until the very end. Each entry is { tag name, replacement }.
    var structuralTags = new[]
    {
        new[] { "td", "\t" },      // table cells are separated by tabs
        new[] { "br", "\r" },      // also matches <br/> and <br />
        new[] { "li", "\r- " },    // list items become dash bullets
        new[] { "div", "\r\r" },   // block elements start a new paragraph
        new[] { "tr", "\r\r" },
        new[] { "p", "\r\r" },
        new[] { "pre", "\r\r" },
        new[] { "ol", "\r\r" },
        new[] { "ul", "\r\r" }
    };
    foreach (var entry in structuralTags)
    {
        // \b makes sure that only the exact tag name matches (<p>, but not <param>).
        result = Regex.Replace(
            result,
            @"<\s*" + entry[0] + @"\b[^>]*>",
            entry[1],
            RegexOptions.IgnoreCase);
    }

    // Remove every remaining tag (links, images, closing tags, comments, ...).
    result = Regex.Replace(result, @"<[^>]*>", string.Empty);

    // Translate special characters. Both the entity name and, where one exists, the
    // literal character are mapped. Each entry is { pattern, replacement }.
    // This must happen after the tags were stripped, otherwise a decoded "&lt;"
    // could be mistaken for the start of a tag.
    var entities = new[]
    {
        new[] { @"&nbsp;", " " },
        new[] { @"&bull;|\u2022", " * " },
        new[] { @"&lsaquo;|\u2039", "<" },
        new[] { @"&rsaquo;|\u203A", ">" },
        new[] { @"&trade;|\u2122", "(tm)" },
        new[] { @"&frasl;|\u2044", "/" },
        new[] { @"&lt;", "<" },
        new[] { @"&gt;", ">" },
        new[] { @"&copy;|\u00A9", "(c)" },
        new[] { @"&reg;|\u00AE", "(r)" },
        new[] { @"&quot;", "\"" }
    };
    foreach (var entity in entities)
    {
        result = Regex.Replace(result, entity[0], entity[1], RegexOptions.IgnoreCase);
    }

    // Remove all other well-formed entities (named or numeric). "&amp;" is skipped
    // here and decoded last, so that a double-escaped "&amp;lt;" ends up as the
    // text "&lt;" instead of being decoded twice or deleted.
    result = Regex.Replace(
        result,
        @"&(?!amp;)(#[0-9a-z]{1,7}|[a-z][a-z0-9]{1,7});",
        string.Empty,
        RegexOptions.IgnoreCase);
    result = Regex.Replace(result, @"&amp;", "&", RegexOptions.IgnoreCase);

    // Remove spaces that are enclosed by line breaks and/or tabs. The lookahead
    // does not consume the right-hand delimiter, so chains such as
    // "\r \r \r" are cleaned up completely.
    result = Regex.Replace(result, "([\r\t]) +(?=[\r\t])", "$1");

    // Remove tabs between two line breaks.
    result = Regex.Replace(result, "(\r)(\t)+(\r)", "\r\r");

    // Replace multiple tabs following a line break with just one tab.
    result = Regex.Replace(result, "(\r)(\t)+", "\r\t");

    // Limit runs of line breaks to two and runs of tabs to four.
    result = Regex.Replace(result, "\r{3,}", "\r\r");
    result = Regex.Replace(result, "\t{5,}", "\t\t\t\t");

    // Remove a space at the beginning of a line.
    result = result.Replace("\r ", "\r");

    // Switch from the internal marker to the platform line break.
    result = result.Replace("\r", Environment.NewLine);

    // ReSharper restore LocalizableElement

    return result.Trim();
}
Das Original habe ich seinerzeit auf Pastebin veröffentlicht.