using Microsoft.SharePoint.Client;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Xml;
namespace ConfluenceToSharePointMigrator
{
class HTMLFileProcess
{
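/// <summary>
/// Get the page name from either html file name or title from the html content
/// </summary>
/// <param name="pageName">file name of html</param>
/// <param name="htmlFile">html file path</param>
/// <returns>The SharePoint page name.</returns>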
public string SharePointPageNameFromHtml(string pageName, string htmlFile)
{
    // Builds a display name from an exported HTML file name:
    //   1. "---" becomes " - " and every remaining "-" becomes a space;
    //   2. everything from the last '_' (or, when there is no '_', from the
    //      last '.') to the end of the name is dropped.
    // htmlFile is not read by this method; the parameter is kept so that
    // existing callers continue to compile.

    // Temporary stand-in for "---" so the single-dash replacement below does
    // not break the triple dash apart before it can be turned into " - ".
    const string dashPlaceholder = "@$";

    string name = pageName.Replace("---", dashPlaceholder).Replace("-", " ");

    // Prefer the last underscore as the cut point; fall back to the last dot
    // (the file extension) when the name has no underscore.
    int cutIndex = name.LastIndexOf('_');
    if (cutIndex < 0)
    {
        cutIndex = name.LastIndexOf('.');
    }

    // Cut by position rather than by replacing the suffix text: a text
    // replacement would also delete earlier occurrences of the same suffix
    // (e.g. "x_12_12" would collapse to "x"). When neither separator is
    // present there is nothing to cut, so the name is kept whole instead of
    // calling Substring with a negative start index.
    if (cutIndex >= 0)
    {
        name = name.Substring(0, cutIndex);
    }

    return name.Replace(dashPlaceholder, " - ");
}
/// <summary>
/// Remove the unwanted content like header, author.
/// </summary>
/// <param name="fileContent">html file content</param>
/// <returns></returns>
public string FrameHTMLContentForSharePointPage(string fileContent)
{
//Remove main-header
Regex headerPattern = new Regex(@"
.*?\\d\\d, \\d\\d\\d\\d", "");
//Remove title
fileContent = Regex.Replace(fileContent, "
.*?", "");
//Remove attachment content section
Regex attachmentPattern = new Regex("
(.*?\n.*?)*.*?");
fileContent = attachmentPattern.Replace(fileContent, "