HTML in Plain-Text konvertieren

Hier ein Code-Schnipsel, um HTML-Code so in reinen Text umzuwandeln, dass ein halbwegs menschenlesbarer Text rauskommt:

/// <summary>
/// Converts an HTML fragment or document into roughly human-readable plain text.
/// </summary>
/// <remarks>
/// This is a regex-based best-effort conversion, not an HTML parser. It:
/// collapses source whitespace; drops head, script and style elements and
/// comments; turns td into a tab, br and li into a line break, and
/// div/tr/p/pre/ol/ul into a blank line; strips all remaining tags; decodes a
/// small set of character entities and removes the other short entities;
/// finally limits runs of line breaks to two and runs of tabs to four.
/// </remarks>
/// <param name="html">The HTML to convert. May be <c>null</c> or empty.</param>
/// <returns>
/// The trimmed plain text using <see cref="Environment.NewLine"/> as line break,
/// or an empty string when <paramref name="html"/> is <c>null</c> or empty.
/// </returns>
public static string ConvertHtmlToPlainText(string html)
{
	// http://pastebin.com/NswerNkQ
	// http://stackoverflow.com/questions/8419517/convert-html-to-plain-text-while-preserving-p-br-ul-ol
	// http://www.codeproject.com/KB/HTML/HTML_to_Plain_Text.aspx

	if (string.IsNullOrEmpty(html))
	{
		return string.Empty;
	}

	const RegexOptions ignoreCase = RegexOptions.IgnoreCase;

	// ReSharper disable LocalizableElement

	// Remove source formatting: tabs are dropped, every other run of
	// whitespace (including line breaks) collapses to one space, as a
	// browser would render it. From here on '\r' and '\t' only ever mean
	// "line break" / "column separator" inserted by this method.
	var result = html.Replace("\t", string.Empty);
	result = Regex.Replace(result, @"\s+", " ");

	// Remove head, script and style elements including their content.
	// The lazy ".*?" stops at the FIRST closing tag, so visible text between
	// two script/style blocks is kept. "\b" keeps <header> from being
	// mistaken for <head>.
	foreach (var tag in new[] { "head", "script", "style" })
	{
		result = Regex.Replace(
			result,
			"< *" + tag + @"\b[^>]*>.*?< */ *" + tag + " *>",
			string.Empty,
			ignoreCase);
	}

	// Remove comments as a whole; they may contain '>' which would
	// otherwise cut the generic tag removal short.
	result = Regex.Replace(result, @"<!--.*?-->", string.Empty);

	// Table cells become tabs.
	result = Regex.Replace(result, @"< *td\b[^>]*>", "\t", ignoreCase);

	// <br> and <li> become line breaks; "[^>]*" also accepts the
	// self-closing forms (<br/>, <br />) and tags carrying attributes.
	result = Regex.Replace(result, @"< *br\b[^>]*>", "\r", ignoreCase);
	result = Regex.Replace(result, @"< *li\b[^>]*>", "\r- ", ignoreCase);

	// Block-level openers become paragraphs (double line breaks).
	// "\b" prevents matching longer tag names such as <param> or <track>.
	result = Regex.Replace(result, @"< *(div|tr|p|pre|ol|ul)\b[^>]*>", "\r\r", ignoreCase);

	// Remove remaining tags (closing tags, <a>, images, ...).
	result = Regex.Replace(result, @"<[^>]*>", string.Empty);

	// Decode the entities that have a sensible plain-text form.
	string[,] entities =
	{
		{ "&nbsp;", " " },
		{ "&bull;", " * " },
		{ "&lsaquo;", "<" },
		{ "&rsaquo;", ">" },
		{ "&trade;", "(tm)" },
		{ "&frasl;", "/" },
		{ "&lt;", "<" },
		{ "&gt;", ">" },
		{ "&copy;", "(c)" },
		{ "&reg;", "(r)" }
	};
	for (var i = 0; i < entities.GetLength(0); i++)
	{
		result = Regex.Replace(result, entities[i, 0], entities[i, 1], ignoreCase);
	}

	// Remove all other short entities. Only real entity syntax is matched
	// (optional '#', then letters/digits), so ordinary text like "A & B; C"
	// stays intact. "&amp;" is excluded here and decoded last so that an
	// escaped entity such as "&amp;lt;" is not decoded twice.
	result = Regex.Replace(result, @"&(?!amp;)#?[a-z0-9]{2,6};", string.Empty, ignoreCase);
	result = Regex.Replace(result, @"&amp;", "&", ignoreCase);

	// Remove spaces squeezed between breaks/tabs. Lookarounds do not
	// consume the neighbours, so overlapping runs ("\r \r \r") are handled.
	result = Regex.Replace(result, @"(?<=[\r\t]) +(?=[\r\t])", string.Empty);
	// Remove tabs that sit between two line breaks.
	result = Regex.Replace(result, @"(?<=\r)\t+(?=\r)", string.Empty);
	// Multiple tabs following a line break become a single tab.
	result = Regex.Replace(result, @"(?<=\r)\t+", "\t");

	// Limit runs: more than 2 breaks -> 2, more than 4 tabs -> 4.
	result = Regex.Replace(result, @"\r{3,}", "\r\r");
	result = Regex.Replace(result, @"\t{5,}", "\t\t\t\t");

	// Drop a space at the beginning of a line.
	result = result.Replace("\r ", "\r");

	// Normalize line breaks to the platform convention.
	result = result.Replace("\r", Environment.NewLine);

	// ReSharper restore LocalizableElement

	return result.Trim();
}

Das Original habe ich seinerzeit auf Pastebin veröffentlicht.

HTML sollte nie mit regulären Ausdrücken behandelt werden. Das Ergebnis ist gelinde gesagt bescheiden und im Allgemeinen auch nicht robust. Lieber eine vernünftige Library hernehmen…

1 Like

Danke dir. Der Grund für deine zutreffende Aussage ist, glaube ich, wenn ich mich recht erinnere, dass Regex gar nicht den Semantik-Level von HTML/XML abdecken kann.

Siehe dazu auch:

Und ein Blog-Beitrag vom berühmten Jeff Atwood dazu:

http://blog.codinghorror.com/parsing-html-the-cthulhu-way/

Bestes Zitat daraus:

Even Jon Skeet cannot parse HTML using regular expressions.

Absolut richtig. [Chomsky Type 2 vs Chomsky Type 3 beschreibt die Problematik am treffendsten]

1 Like