The topics described here concentrate fully on the pure .NET Framework, describing the actual meaning of some programming concepts, the FCL, and best practices. However, you will be using these concepts in all .NET Framework-compatible languages, such as Microsoft Visual Basic .NET (VB.NET) or C# .NET (csharp.Net), to build web applications (ASP.NET), desktop applications (WinForms .NET), or Web/Windows services.





This blog has moved!

You will be automatically redirected to the new address. If that does not occur, visit
http://Codemine.net
and update your bookmarks.

Wednesday, September 8, 2010

Converting from MS Word to HTML / XHTML


The C# .NET code below helps you clean HTML that you have copied from Microsoft Word. It removes the unwanted XML and other Word-specific markup and gives you clean, tidy HTML that you can then easily convert to XHTML. In short, this C# code helps you convert an MS Word-specific HTML document to plain HTML, and thereby to XHTML.

/// <summary>
/// Runs a fixed sequence of string and regular-expression replacements over
/// <paramref name="html"/> to strip Microsoft Word-specific markup (o:p tags,
/// mso-* style declarations, namespaced tags, quoted font-family names, ...)
/// and returns the resulting string.
/// </summary>
/// <param name="html">The HTML text to clean.</param>
/// <returns>The text after all replacements have been applied in order.</returns>
/// <remarks>
/// NOTE(review): several patterns and literals below look truncated, as if
/// tag-like text (e.g. "&lt;name[^&gt;") and HTML entities were lost when this
/// listing was published. Those lines are flagged individually; recover the
/// intended patterns from the original source before relying on this method.
/// The method also depends on <c>Quotes</c> and <c>Application</c>, neither of
/// which is defined in this block.
/// </remarks>
static string CleanHtml(string html)
{
     // NOTE(review): 'sc' is never read or written again in this method.
     StringCollection sc = new StringCollection();
     // Remove <o:p> and </o:p> tags. RegexOptions.Compiled is OR-ed in twice here;
     // repeating the same flag has no additional effect.
     html = Regex.Replace(html, @"<(o:p|\/o:p)>", "", RegexOptions.IgnoreCase|RegexOptions.Compiled|RegexOptions.Compiled);
     // Remove "mso-<name>: <value>;" declarations (value runs up to ';' or a double quote, \x22).
     html = Regex.Replace(html, @"\s*mso-[^:]+:[^;\x22]+;?", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     // Remove "MARGIN-<side>: 0<two word chars>" declarations, i.e. zero values such as 0cm / 0in / 0pt.
     html = Regex.Replace(html, @"\s*MARGIN-\w{3,6}:\s*0\w{2}\s*;?", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     // Remove a "MARGIN:" shorthand made of exactly three consecutive zero values.
     html = Regex.Replace(html, @"\s*MARGIN\s*:(\s*0\w{2}){3}\s*;?", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     // Remove "TEXT-INDENT: 0" followed by any two characters and a ';' (the ';' is required here).
     html = Regex.Replace(html, @"\s*TEXT-INDENT: 0.{2}\s*;", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     // NOTE(review): the two string literals in the next statement are split across
     // physical lines, which a regular (non-verbatim) C# string literal does not allow.
     // The original arguments were presumably escape sequences or entities lost in
     // publishing - confirm against the original source.
     html = html.Replace("
"
, "
"
);
     // Replace the first argument with a tab character.
     // NOTE(review): the first argument appears as a single blank; it may originally
     // have been an entity or a non-breaking space - confirm.
     html = html.Replace(" ", "\t");
     // Rewrite align="middle" as align="center" (exact, case-sensitive text match).
     html = html.Replace("align=\"middle\"", "align=\"center\"");
     // NOTE(review): 'Application' is not defined in this block - presumably
     // System.Windows.Forms.Application, called to keep a UI responsive; confirm.
     Application.DoEvents();
     // Remove tags of the form "<?xml ...>" (an optional backslash is allowed before the '?').
     html = Regex.Replace(html, @"<\\?\?xml[^>]*>", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     // Remove any opening or closing tag whose name has a namespace prefix, e.g. <w:x ...> or </st1:y>.
    html = Regex.Replace(html, @"<\/?\w+:[^>]*>", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     // Disabled replacement; its pattern text is incomplete as shown.
     //html = Regex.Replace(html, @", "$1", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     // Drop a "-->" plus the whitespace after it, keeping only the captured trailing character.
     // NOTE(review): the captured character appears as a blank; it may originally have been an entity - confirm.
     html = Regex.Replace(html, @"-->\s*\r*( )", "$1", RegexOptions.IgnoreCase|RegexOptions.Compiled);

     // Rewrite a single-quoted style attribute value on a tag, dropping the quotes.
     // NOTE(review): the replacement references $5 and $7 but the pattern defines only
     // three groups; .NET substitutes undefined group numbers as literal text, so
     // "$5$7" would be written into the output - verify the intended pattern.
     html = Regex.Replace(html, @"(<\w[^>]*style=)(')([^ |>]*)'", "$1$3$5$7", RegexOptions.IgnoreCase|RegexOptions.Compiled);

     // The next four replacements strip the double quotes (\x22) around font names in
     // "font-family:" lists of four, three, two and one quoted name respectively,
     // keeping the names and the commas between them.
     html = Regex.Replace(html, @"(font-family:)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)", "$1$3$5$7$9$11$13$15", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     html = Regex.Replace(html, @"(font-family:)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)", "$1$3$5$7$9$11",RegexOptions.IgnoreCase|RegexOptions.Compiled);
     html = Regex.Replace(html, @"(font-family:)\s*\x22([\w\s-]*)\s*\x22(,)\s*\x22([\w\s-]*)\s*\x22", "$1$2$3$4",RegexOptions.IgnoreCase|RegexOptions.Compiled);
     html = Regex.Replace(html, @"(font-family:)\s*(\x22)([\w\s-]*)\s*(\x22)", "$1$3", RegexOptions.IgnoreCase|RegexOptions.Compiled);

      Application.DoEvents();
  
     // NOTE(review): as shown, this replaces the middle-dot character with "\xB7",
     // which is the same character; the first argument was presumably an HTML entity - confirm.
     html = html.Replace("·", "\xB7");
   
     // NOTE(review): 'Quotes' is not defined in this block (presumably a field or
     // property of the enclosing type). The three patterns below begin with "(]*)"
     // and contain an empty group "()", so they look truncated; as written they match
     // optional ']' characters, one non-'/' character and a '>'. Confirm the intended
     // tag names before use.
     if (Quotes)
     {
  html = Regex.Replace(html, @"(]*)([^/]>)\s*()?", "$1\"/>", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     }
     else
     {
  html = Regex.Replace(html, @"(]*)([^/]>)\s*()?", "$1/>", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     }
     html = Regex.Replace(html, @"(]*)([^/]>)\s*()?", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);

     // Remove Style="mso-..." attributes (quotes optional).
     html = Regex.Replace(html, @"Style\s*=\x22?\s*mso-?\w*\s*\x22?", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     // NOTE(review): the following eight statements use the identical pattern "]*>\s*",
     // which as written matches optional ']' characters, a '>' and trailing whitespace -
     // i.e. it would delete every '>' in the input. Each pattern presumably named a
     // different tag that was lost in publishing - confirm against the original source.
     html = Regex.Replace(html, @"]*>\s*", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);

     html = Regex.Replace(html, @"]*>\s*", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     html = Regex.Replace(html, @"]*>\s*", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     html = Regex.Replace(html, @"]*>\s*", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     html = Regex.Replace(html, @"]*>\s*", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);

     html = Regex.Replace(html, @"]*>\s*", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     html = Regex.Replace(html, @"]*>\s*", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     html = Regex.Replace(html, @"]*>\s*", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
 
     // NOTE(review): same concern for the next two patterns ("]*/>"); they look like
     // self-closing-tag removals whose tag names are missing.
     html = Regex.Replace(html, @"]*/>", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     html = Regex.Replace(html, @"]*/>", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);

     // Merge two adjacent style attributes: the closing quote of the first and the
     // opening 'Style="' of the second are replaced by a ';'.
     html = Regex.Replace(html, @"(style\s*=\s*\x22[^\x22]*)(\s*\x22\s*Style\s*=\s*\x22)", "$1;", RegexOptions.IgnoreCase|RegexOptions.Compiled);
 
     // Remove Class attributes whose value has the form <digits>-<digits> (quotes optional).
     html = Regex.Replace(html, @"Class\s*=\x22?\s*\d*-\d*\s*\x22?", "", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     // NOTE(review): groups 1 and 3 are empty, so as written this replaces each run of
     // word/whitespace characters with itself; the surrounding tag text was presumably lost - confirm.
     html = Regex.Replace(html, @"()([\s\w]*)()", "$2", RegexOptions.IgnoreCase|RegexOptions.Compiled);
     // NOTE(review): as shown, each of the next three calls replaces a symbol with
     // itself; one side of each pair was presumably an HTML entity - confirm.
     html = html.Replace("®", "®");
     html = html.Replace("©", "©");
     html = html.Replace("™", "™");
     return html;
 }


0 comments: