The C# .NET code below helps you clean up HTML that was copied from Microsoft Word. It removes the unwanted XML and other Word-specific markup and leaves you with clean, tidy HTML that you can then easily convert to XHTML. In short, this C# code helps you convert a Word-specific HTML document to plain HTML and, from there, to XHTML.
/// <summary>
/// Runs <paramref name="html"/> through a fixed sequence of regular-expression and
/// literal replacements aimed at Word-generated markup (o:p tags, mso-* style
/// declarations, zero margins, xml processing tags, namespace-prefixed tags,
/// quoted font-family lists) and returns the resulting string.
/// </summary>
/// <param name="html">The markup to clean.</param>
/// <returns>The string left after every replacement step has been applied in order.</returns>
/// <remarks>
/// NOTE(review): this listing appears to have been damaged when it was published
/// (HTML-like text inside string literals looks stripped). Do not use it as-is;
/// restore the affected literals from the original source first. Specifically:
/// 1. The html.Replace calls that follow the TEXT-INDENT step have regular string
///    literals split across physical lines, which C# does not allow for
///    non-verbatim strings. Their original arguments cannot be recovered from here.
/// 2. The patterns inside both Quotes branches, and the run of patterns that start
///    with a closing square bracket, look like they lost a tag name. As written,
///    the bracket-star-angle pattern matches every closing angle bracket (plus
///    trailing whitespace) and removes it. Presumably each one targeted a specific
///    tag -- TODO confirm against the original.
/// 3. The single-quoted style= step uses the replacement $1$3$5$7, but its pattern
///    defines only three groups. .NET writes undefined numbered groups out
///    literally, so the text $5$7 would end up in the output.
/// 4. The middle-dot, registered, copyright and trademark replacements map a
///    character to itself as written; presumably the first argument was an HTML
///    entity originally -- verify.
/// 5. The step whose pattern is two empty groups around a whitespace/word run
///    replaces each match with the same text, so it is a no-op as written.
/// 6. The space-to-tab replacement, as written, turns every space into a tab;
///    presumably the search string was something else originally -- verify.
/// 7. Quotes is not declared in this method (presumably a field or property of the
///    enclosing type), and Application.DoEvents is called twice (presumably the
///    Windows Forms message pump) -- confirm both are intended in a string helper.
/// </remarks>
static string CleanHtml(string html)
{
// NOTE(review): sc is never used anywhere in this method; looks like a leftover.
StringCollection sc = new StringCollection();
// Remove opening and closing o:p tags. RegexOptions.Compiled is listed twice here;
// OR-ing a flag with itself has no effect.
html = Regex.Replace(html, @"<(o:p|\/o:p)>", "", RegexOptions.IgnoreCase|RegexOptions.Compiled|RegexOptions.Compiled);
// Remove mso-<name>:<value> declarations, stopping at a semicolon or a double quote (\x22).
html = Regex.Replace(html, @"\s*mso-[^:]+:[^;\x22]+;?", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
// Remove MARGIN-<side> declarations whose value is 0 followed by a two-character unit.
html = Regex.Replace(html, @"\s*MARGIN-\w{3,6}:\s*0\w{2}\s*;?", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
// Remove shorthand MARGIN declarations made of exactly three zero values.
html = Regex.Replace(html, @"\s*MARGIN\s*:(\s*0\w{2}){3}\s*;?", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
// Remove TEXT-INDENT declarations of 0 plus any two characters; the trailing semicolon is required here.
html = Regex.Replace(html, @"\s*TEXT-INDENT: 0.{2}\s*;", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
// NOTE(review): the next statements contain string literals broken across lines and will not compile; see remarks.
html = html.Replace("
", "
");
", "
");
html = html.Replace(" ", "\t");
html = html.Replace("align=\"middle\"", "align=\"center\"");
Application.DoEvents();
html = Regex.Replace(html, @"<\\?\?xml[^>]*>", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"<\/?\w+:[^>]*>", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
//html = Regex.Replace(html, @", "$1", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"-->\s*\r*( )", "$1", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"(<\w[^>]*style=)(')([^ |>]*)'", "$1$3$5$7", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"(font-family:)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)", "$1$3$5$7$9$11$13$15", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"(font-family:)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)", "$1$3$5$7$9$11",RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"(font-family:)\s*\x22([\w\s-]*)\s*\x22(,)\s*\x22([\w\s-]*)\s*\x22", "$1$2$3$4",RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"(font-family:)\s*(\x22)([\w\s-]*)\s*(\x22)", "$1$3", RegexOptions.IgnoreCase|RegexOptions.Compiled);
Application.DoEvents();
html = html.Replace("·", "\xB7");
if (Quotes)
{
html = Regex.Replace(html, @"(
]*)([^/]>)\s*()?", "$1\"/>", RegexOptions.IgnoreCase|RegexOptions.Compiled);
}
else
{
html = Regex.Replace(html, @"(
]*)([^/]>)\s*()?", "$1/>", RegexOptions.IgnoreCase|RegexOptions.Compiled);
}
html = Regex.Replace(html, @"(]*)([^/]>)\s*()?", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"Style\s*=\x22?\s*mso-?\w*\s*\x22?", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"]*>\s*", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"]*>\s*", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"]*>\s*", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"]*>\s*", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"]*>\s*", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"]*>\s*" , "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"]*>\s*", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"]*>\s*", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"]*/>", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"]*/>", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"(style\s*=\s*\x22[^\x22]*)(\s*\x22\s*Style\s*=\s*\x22)", "$1;", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"Class\s*=\x22?\s*\d*-\d*\s*\x22?", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = Regex.Replace(html, @"()([\s\w]*)()", "$2", RegexOptions.IgnoreCase|RegexOptions.Compiled);
html = html.Replace("®", "®");
html = html.Replace("©", "©");
html = html.Replace("™", "™");
return html;
}
0 comments:
Post a Comment