.*?

using Microsoft.SharePoint.Client; using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Text; using System.Text.RegularExpressions; using System.Threading.Tasks; using System.Xml; namespace ConfluenceToSharePointMigrator { class HTMLFileProcess { ///

/// <summary>
/// Get the page name from either the HTML file name or the title in the HTML content.
/// </summary>

/// <param name="pageName">File name of the exported HTML page, e.g. "My-Page---Intro_12345.html".</param>
/// <param name="htmlFile">HTML file path. Not read by this method; kept so existing callers keep compiling.</param>
/// <returns>
/// The page name with "---" turned into " - ", every remaining "-" turned into a space, and
/// everything from the last '_' (or, when there is no '_', the last '.') to the end removed.
/// When the name contains neither character, nothing is removed.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="pageName"/> is null.</exception>
public string SharePointPageNameFromHtml(string pageName, string htmlFile)
{
    if (pageName == null)
    {
        throw new ArgumentNullException("pageName");
    }

    // Shield the "---" separator behind a placeholder so the single-dash replacement
    // below does not consume it. NOTE: an input that already contains the literal
    // "@$" will also end up as " - ".
    const string SeparatorPlaceholder = "@$";
    string name = pageName.Replace("---", SeparatorPlaceholder).Replace("-", " ");

    // Cut at the last '_' when there is one, otherwise at the last '.'.
    int cutPosition = name.LastIndexOf('_');
    if (cutPosition < 0)
    {
        cutPosition = name.LastIndexOf('.');
    }

    // With no separator at all there is nothing to strip; the previous code passed -1
    // to Substring here and threw ArgumentOutOfRangeException.
    if (cutPosition >= 0)
    {
        // Keep only the part before the cut. Removing the suffix via string.Replace
        // would also delete any earlier occurrence of the same text.
        name = name.Substring(0, cutPosition);
    }

    return name.Replace(SeparatorPlaceholder, " - ");
}
///

/// <summary>
/// Remove unwanted content, such as the header and author information.
/// </summary>

/// html file content /// public string FrameHTMLContentForSharePointPage(string fileContent) { //Remove main-header Regex headerPattern = new Regex(@"

.*?\\d\\d, \\d\\d\\d\\d", ""); //Remove title fileContent = Regex.Replace(fileContent, ".*?", ""); //Remove attachment content section Regex attachmentPattern = new Regex("

(.*?\n.*?)*.*?"); fileContent = attachmentPattern.Replace(fileContent, "

"); //Remove the document generated information fileContent = Regex.Replace(fileContent, "

Document generated by Confluence on.*?

", ""); fileContent = Regex.Replace(fileContent, "