HTML in Plain-Text konvertieren

UweKeim · 29. Januar 2015 um 15:26

Hier ein Code-Schnipsel, um HTML-Code so in reinen Text umzuwandeln, dass ein halbwegs menschenlesbarer Text herauskommt:

/// <summary>
/// Converts HTML markup into reasonably human-readable plain text.
/// </summary>
/// <remarks>
/// This is a regex-based best-effort conversion, not an HTML parser. It
/// removes <c>head</c>, <c>script</c> and <c>style</c> elements together
/// with their content, maps a few structural tags to tabs and line breaks
/// (<c>td</c>, <c>br</c>, <c>li</c>, <c>div</c>, <c>tr</c>, <c>p</c>,
/// <c>ol</c>, <c>ul</c>), strips every remaining tag, decodes a small set
/// of character entities and finally collapses redundant whitespace.
/// </remarks>
/// <param name="html">The HTML markup to convert. May be <c>null</c> or empty.</param>
/// <returns>
/// The trimmed plain text, using <see cref="Environment.NewLine"/> as line
/// separator; an empty string when <paramref name="html"/> is <c>null</c> or empty.
/// </returns>
public static string ConvertHtmlToPlainText(string html)
{
	// http://pastebin.com/NswerNkQ
	// http://stackoverflow.com/questions/8419517/convert-html-to-plain-text-while-preserving-p-br-ul-ol
	// http://www.codeproject.com/KB/HTML/HTML_to_Plain_Text.aspx

	// Nothing to convert; avoids a NullReferenceException on null input.
	if (string.IsNullOrEmpty(html))
	{
		return string.Empty;
	}

	// ReSharper disable LocalizableElement
	const RegexOptions ignoreCase = RegexOptions.IgnoreCase;

	// Source formatting carries no meaning in HTML: line breaks become
	// spaces (as browsers render them), indentation tabs are dropped and
	// runs of whitespace collapse into a single space.
	var result = html.Replace("\r", " ").Replace("\n", " ").Replace("\t", string.Empty);
	result = Regex.Replace(result, @"\s+", " ");

	// Remove head, script and style elements including their content.
	// The lazy quantifier stops at the FIRST matching closing tag, so text
	// between two separate script/style blocks is preserved. The word
	// boundary keeps e.g. <header> from being treated as <head>.
	result = Regex.Replace(result,
				@"<\s*(head|script|style)\b[^>]*>.*?<\s*/\s*\1\s*>", string.Empty,
				ignoreCase);

	// Table cells are separated by tabs.
	result = Regex.Replace(result, @"<\s*td\b[^>]*>", "\t", ignoreCase);

	// <br> (also <br/> and <br />) becomes a line break, <li> (with or
	// without attributes) becomes a dashed list line.
	result = Regex.Replace(result, @"<\s*br\b[^>]*>", "\r", ignoreCase);
	result = Regex.Replace(result, @"<\s*li\b[^>]*>", "\r- ", ignoreCase);

	// Block-level tags start a new paragraph (double line break). The word
	// boundary prevents <pre>, <param>, <track> etc. from matching.
	result = Regex.Replace(result,
				@"<\s*(div|tr|p|ol|ul)\b[^>]*>", "\r\r",
				ignoreCase);

	// Remove remaining tags like <a>, links, images,
	// comments etc - anything that's enclosed inside < >
	result = Regex.Replace(result, @"<[^>]*>", string.Empty);

	// Decode the entities that have a sensible plain-text representation.
	var entities = new[]
	{
		new[] { "&nbsp;", " " },
		new[] { "&bull;", " * " },
		new[] { "&lsaquo;", "<" },
		new[] { "&rsaquo;", ">" },
		new[] { "&trade;", "(tm)" },
		new[] { "&frasl;", "/" },
		new[] { "&lt;", "<" },
		new[] { "&gt;", ">" },
		new[] { "&copy;", "(c)" },
		new[] { "&reg;", "(r)" },
		new[] { "&quot;", "\"" }
	};
	foreach (var entity in entities)
	{
		result = Regex.Replace(result, entity[0], entity[1], ignoreCase);
	}

	// Drop all other named/numeric entities. Only real entity syntax is
	// matched, so ordinary text such as "AT&T is; ..." stays untouched.
	result = Regex.Replace(result,
				@"&(?!amp;)#?[a-z0-9]{2,6};", string.Empty,
				ignoreCase);

	// Decode &amp; last so that e.g. "&amp;lt;" yields the literal "&lt;"
	// instead of being decoded twice.
	result = Regex.Replace(result, @"&amp;", "&", ignoreCase);

	// Remove spaces that sit between line breaks and/or tabs. Lookarounds
	// keep the delimiters unconsumed so that overlapping runs such as
	// "\r \r \r" are cleaned completely in one pass.
	result = Regex.Replace(result, @"(?<=[\r\t]) +(?=[\r\t])", string.Empty);

	// Remove tabs between two line breaks, then reduce tabs that follow a
	// line break to a single tab.
	result = Regex.Replace(result, @"(?<=\r)\t+(?=\r)", string.Empty);
	result = Regex.Replace(result, @"(?<=\r)\t+", "\t");

	// Limit consecutive line breaks to 2 and consecutive tabs to 4.
	result = Regex.Replace(result, @"\r{3,}", "\r\r");
	result = Regex.Replace(result, @"\t{5,}", "\t\t\t\t");

	// UK: Space at the beginning.
	result = Regex.Replace(result, @"(?<=\r) +", string.Empty);

	// UK: Normalize.
	result = result.Replace("\r", Environment.NewLine);
	// ReSharper restore LocalizableElement

	// That's it.
	return result.Trim();
}

Das Original habe ich seinerzeit auf Pastebin veröffentlicht.

fibonacci · 7. Februar 2015 um 20:07

HTML sollte nie mit regulären Ausdrücken behandelt werden. Das Ergebnis ist gelinde gesagt bescheiden und im Allgemeinen auch nicht robust. Lieber eine vernünftige Library hernehmen…

UweKeim · 7. Februar 2015 um 20:28

Danke dir. Der Grund für deine zutreffende Aussage ist, glaube ich, wenn ich mich recht erinnere, dass Regex den Semantik-Level von HTML/XML gar nicht abdecken kann.

Siehe dazu auch:

Und ein Blog-Beitrag vom berühmten Jeff Atwood dazu:

Bestes Zitat daraus:

Even Jon Skeet cannot parse HTML using regular expressions.

fibonacci · 7. Februar 2015 um 21:27

Absolut richtig. [Chomsky Type 2 vs. Chomsky Type 3 beschreibt die Problematik am treffendsten]