using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using GitConverter.Lib.Factories;
using GitConverter.Lib.Logging;
using GitConverter.Lib.Models;
using SharpCompress.Archives;
namespace GitConverter.Lib.Converters
{
/// <summary>
/// High-level conversion orchestrator.
/// </summary>
/// <remarks>
/// Responsibilities
/// - Validate input / output / temp paths and prepare folders (delegates to <see cref="ConverterUtils.ValidateAndPreparePaths"/>).
/// - Determine input kind (single file vs archive) and select an appropriate converter key.
/// - Dispatch to a resolved converter instance obtained from an <see cref="IConverterFactory"/>.
///
/// Detection and selection summary
/// - Single-file inputs:
///   - Use explicit extension-to-converter mapping for known extensions (e.g. .geojson -> "GeoJson", .topojson -> "TopoJson", .esrijson -> "EsriJson").
///   - For generic .json files perform a bounded head read (see HeaderReadLimit) and classify via <see cref="ClassifyJsonHeader"/>.
///   - NDJSON / GeoJsonSeq is only selected when the head contains at least NdjsonThreshold JSON-like lines to avoid misclassifying single GeoJSON objects.
/// - Archive inputs:
///   - Inspect archive entries (via <see cref="ConverterUtils.TryListArchiveEntries"/>) and collect extension / marker information.
///   - Apply per-entry JSON classification and majority voting when .json entries exist. A single majority selects the converter; a tie returns a friendly failure (reason contains "ambiguous JSON in archive").
///   - KMZ guard: prefer "Kmz" only when the outer archive filename extension is .kmz OR a top-level "doc.kml" entry exists at the root of the archive.
///   - Fallback to requirement matching: a rule wins when all its required markers are present (first match wins).
///
/// Safety and performance
/// - Header reads are bounded to HeaderReadLimit bytes to avoid loading large files.
/// - JSON entry reads for archives use streaming reads and are limited to the same head size.
///
/// Logging and traceability
/// - Logs major detection decisions and the resolved converter key.
/// - When conversion succeeds the orchestrator may append the detection reason to the returned <see cref="ConversionResult"/> for traceability.
///
/// Error handling
/// - Uses <see cref="ConversionResult.Failure(string)"/> to return friendly, actionable errors for expected problems (missing files, ambiguous detection, permission issues).
/// - Avoids throwing for expected validation errors; unexpected exceptions are caught and returned as a failure result.
///
/// Unit testing
/// - Tests use FakeFactory / FakeConverter to assert dispatch decisions without performing real conversions.
/// - Tests should assert on stable substrings (case-insensitive) for detection reasons and friendly failure messages to avoid brittle comparisons.
/// </remarks>
public static class ConversionService
{
private const int NdjsonThreshold = 2;
private const int HeaderReadLimit = 64 * 1024; // 64 KB
private static readonly Dictionary _s_archiveRequirements = new Dictionary(StringComparer.OrdinalIgnoreCase)
{
{ "EsriJson", new[] { ".json", ".esrijson" } },
{ "GeoJson", new[] { ".geojson", ".json" } },
{ "GeoJsonSeq", new[] { ".json" } },
{ "Kml", new[] { ".kml" } },
{ "Kmz", new[] { ".kml" } },
{ "Shapefile", new[] { ".shp", ".shx", ".dbf" } },
{ "Osm", new[] { ".osm" } },
{ "Gdb", new[] { ".gdb" } },
{ "Gpx", new[] { ".gpx" } },
{ "TopoJson", new[] { ".json" } },
{ "MapInfoInterchange", new[] { ".mif" } },
{ "MapInfoTab", new[] { ".tab", ".dat", ".map", ".id" } },
{ "Csv", new[] { ".csv" } },
{ "GeoPackage", new[] { ".gpkg" } },
};
private static readonly Dictionary _s_extensionToConverter = new Dictionary(StringComparer.OrdinalIgnoreCase)
{
{ ".geojson", "GeoJson" },
{ ".topojson", "TopoJson" },
{ ".esrijson", "EsriJson" },
{ ".kml", "Kml" },
{ ".kmz", "Kmz" },
{ ".shp", "Shapefile" },
{ ".osm", "Osm" },
{ ".gpx", "Gpx" },
{ ".gml", "Gml" },
{ ".gdb", "Gdb" },
{ ".mif", "MapInfoInterchange" },
{ ".tab", "MapInfoTab" },
{ ".map", "MapInfoTab" },
{ ".dat", "MapInfoTab" },
{ ".id", "MapInfoTab" },
{ ".csv", "Csv" },
{ ".gpkg", "GeoPackage" },
};
///
/// Orchestrate a conversion given paths and a factory.
/// Note: outputFolderPath is expected to be a folder path (not a file path).
///
public static ConversionResult Run(string gisInputFilePath, string outputFolderPath, string tempFolderPath, IConverterFactory factory = null)
{
try
{
Log.Info("ConversionService: Run invoked.");
if (string.IsNullOrWhiteSpace(outputFolderPath))
{
Log.Error("ConversionService: output folder path is required.");
return ConversionResult.Failure("Output folder path is required.");
}
// Reject file-like paths: caller must provide a folder, not a file with extension.
if (Path.HasExtension(outputFolderPath))
{
Log.Error($"ConversionService: output path '{outputFolderPath}' appears to be a file. Provide a folder path instead.");
return ConversionResult.Failure("Output path must be a folder path (no filename/extension).");
}
// Validate inputs and prepare folders (ensure output folder writable and temp ready)
var prep = ConverterUtils.ValidateAndPreparePaths(gisInputFilePath, outputFolderPath, tempFolderPath);
if (prep != null) return prep; // validation failure
if (factory == null)
{
factory = new ConverterFactory();
}
// Archive handling
if (ConverterUtils.IsArchiveFile(gisInputFilePath))
{
Log.Info($"Input '{gisInputFilePath}' detected as archive. Inspecting contents.");
var entries = ConverterUtils.TryListArchiveEntries(gisInputFilePath);
if (entries == null)
{
Log.Error("Failed to list archive entries.");
return ConversionResult.Failure("Failed to inspect archive contents.");
}
var matchedConverter = DetectConverterFromArchiveEntries(entries, gisInputFilePath, out string detectReason);
if (string.IsNullOrEmpty(matchedConverter))
{
Log.Warn("No converter matched archive contents (or match ambiguous).");
if (!string.IsNullOrEmpty(detectReason))
{
Log.Info($"Archive detection reason: {detectReason}");
return ConversionResult.Failure(detectReason);
}
return ConversionResult.Failure("No converter matched archive contents or required files are missing or ambiguous.");
}
Log.Info($"Archive matched converter '{matchedConverter}'. Reason: {detectReason}");
if (!factory.TryCreate(matchedConverter, out var conv))
{
Log.Error($"ConverterFactory failed to resolve converter '{matchedConverter}'.");
return ConversionResult.Failure($"Converter for '{matchedConverter}' is not available.");
}
Log.Info($"Converter '{matchedConverter}' resolved. Invoking Convert(...). Detector reason: {detectReason}");
var convResult = conv.Convert(gisInputFilePath, matchedConverter, outputFolderPath, tempFolderPath);
// bubble detection reason into result message for traceability on success
if (convResult != null && convResult.IsSuccess && !string.IsNullOrWhiteSpace(detectReason))
{
var msg = string.IsNullOrWhiteSpace(convResult.Message) ? $"detector reason: {detectReason}" : $"{convResult.Message} (detector reason: {detectReason})";
Log.Info($"Conversion succeeded for '{matchedConverter}'. {msg}");
return ConversionResult.Success(msg);
}
return convResult ?? ConversionResult.Failure("Converter returned null result.");
}
// single-file handling
var ext = (Path.GetExtension(gisInputFilePath) ?? string.Empty).ToLowerInvariant();
Log.Info($"Input '{gisInputFilePath}' detected as single file with extension '{ext}'.");
// Direct extension routing for explicit JSON-type extensions (no NDJSON sniff)
if (_s_extensionToConverter.TryGetValue(ext, out var directConverter) && (ext == ".geojson" || ext == ".topojson" || ext == ".esrijson"))
{
Log.Info($"Mapped extension '{ext}' to converter '{directConverter}' (reason: explicit extension mapping). Attempting to resolve.");
if (!factory.TryCreate(directConverter, out var convDirect))
{
Log.Error($"ConverterFactory failed to resolve converter '{directConverter}'.");
return ConversionResult.Failure($"Converter for '{directConverter}' is not available.");
}
return convDirect.Convert(gisInputFilePath, directConverter, outputFolderPath, tempFolderPath);
}
// For .json files run detection with NDJSON rule
if (!string.IsNullOrWhiteSpace(ext) && ext.EndsWith("json", StringComparison.OrdinalIgnoreCase))
{
JsonFormatDetector.Format jsonFmt = JsonFormatDetector.Format.Unknown;
string reason = null;
try
{
jsonFmt = JsonFormatDetector.DetectFromFile(gisInputFilePath);
if (jsonFmt != JsonFormatDetector.Format.Unknown)
reason = "JsonFormatDetector.DetectFromFile";
}
catch (Exception detEx)
{
Log.Debug("JsonFormatDetector.DetectFromFile threw: " + detEx.Message + ". Will attempt lightweight header sniff.");
jsonFmt = JsonFormatDetector.Format.Unknown;
}
if (jsonFmt == JsonFormatDetector.Format.Unknown)
{
var head = ReadHeadUtf8(gisInputFilePath, HeaderReadLimit);
jsonFmt = ClassifyJsonHeader(head);
if (jsonFmt == JsonFormatDetector.Format.GeoJsonSeq)
reason = $"Header sniff: NDJSON heuristic (>= {NdjsonThreshold} JSON lines)";
else if (jsonFmt == JsonFormatDetector.Format.TopoJson)
reason = "Header sniff: TopoJSON fingerprint";
else if (jsonFmt == JsonFormatDetector.Format.EsriJson)
reason = "Header sniff: EsriJSON fingerprint";
else if (jsonFmt == JsonFormatDetector.Format.GeoJson)
reason = "Header sniff: GeoJSON fingerprint (Feature/coordinates/FeatureCollection)";
else
reason = "Header sniff: unknown";
}
if (jsonFmt == JsonFormatDetector.Format.Unknown)
{
Log.Error("Unable to parse JSON input to determine specific JSON GIS format.");
return ConversionResult.Failure("Unable to determine JSON format (GeoJson / EsriJson / GeoJsonSeq / TopoJson).");
}
string converterKeyForJson = null;
switch (jsonFmt)
{
case JsonFormatDetector.Format.GeoJson:
converterKeyForJson = "GeoJson";
break;
case JsonFormatDetector.Format.EsriJson:
converterKeyForJson = "EsriJson";
break;
case JsonFormatDetector.Format.GeoJsonSeq:
converterKeyForJson = "GeoJsonSeq";
break;
case JsonFormatDetector.Format.TopoJson:
converterKeyForJson = "TopoJson";
break;
default:
converterKeyForJson = null;
break;
}
if (string.IsNullOrWhiteSpace(converterKeyForJson))
{
Log.Error("Failed to map detected JSON format to a converter key.");
return ConversionResult.Failure("Failed to map JSON format to converter.");
}
Log.Info($"Detected JSON format '{jsonFmt}' (reason: {reason}). Resolving converter '{converterKeyForJson}'.");
if (!factory.TryCreate(converterKeyForJson, out var convJson))
{
Log.Error($"ConverterFactory failed to resolve converter '{converterKeyForJson}'.");
return ConversionResult.Failure($"Converter for '{converterKeyForJson}' is not available.");
}
var convJsonResult = convJson.Convert(gisInputFilePath, converterKeyForJson, outputFolderPath, tempFolderPath);
if (convJsonResult != null && convJsonResult.IsSuccess)
{
var msg = string.IsNullOrWhiteSpace(convJsonResult.Message) ? $"detected by json classifier: {reason}" : $"{convJsonResult.Message} (detected by json classifier: {reason})";
Log.Info($"Conversion succeeded for '{converterKeyForJson}'. {msg}");
return ConversionResult.Success(msg);
}
return convJsonResult ?? ConversionResult.Failure("Converter returned null result.");
}
// Non-json extension mapping
if (!_s_extensionToConverter.TryGetValue(ext, out var converterKeyNonJson))
{
Log.Warn($"No converter mapping for extension '{ext}'.");
return ConversionResult.Failure($"Unknown input file type '{ext}'.");
}
Log.Info($"Mapped extension '{ext}' to converter '{converterKeyNonJson}' (reason: extension mapping). Attempting to resolve.");
if (!factory.TryCreate(converterKeyNonJson, out var convNonJson))
{
Log.Error($"ConverterFactory failed to resolve converter '{converterKeyNonJson}'.");
return ConversionResult.Failure($"Converter for '{converterKeyNonJson}' is not available.");
}
Log.Info($"Converter '{converterKeyNonJson}' resolved. Invoking Convert(...).");
return convNonJson.Convert(gisInputFilePath, converterKeyNonJson, outputFolderPath, tempFolderPath);
}
catch (Exception ex)
{
Log.Error("Unexpected error in ConversionService.Run: " + ex.Message, ex);
return ConversionResult.Failure("Unexpected error: " + ex.Message);
}
}
///
/// Detect converter key from archive entry names. On tie for json voting returns null and sets reason.
///
private static string DetectConverterFromArchiveEntries(IEnumerable entries, string outerPath, out string reason)
{
reason = null;
var exts = new HashSet(StringComparer.OrdinalIgnoreCase);
bool hasTopLevelDocKml = false;
foreach (var e in entries ?? Enumerable.Empty())
{
try
{
if (string.IsNullOrWhiteSpace(e)) continue;
var ext = Path.GetExtension(e);
if (!string.IsNullOrEmpty(ext))
exts.Add(ext.ToLowerInvariant());
var normalized = e.Replace('\\', '/').Trim('/');
if (string.Equals(normalized, "doc.kml", StringComparison.OrdinalIgnoreCase))
hasTopLevelDocKml = true;
var segments = normalized.Split(new[] { '/' }, StringSplitOptions.RemoveEmptyEntries);
foreach (var seg in segments)
{
var idx = seg.LastIndexOf('.');
if (idx > 0 && idx < seg.Length - 1)
exts.Add(seg.Substring(idx).ToLowerInvariant());
if (seg.EndsWith(".gdb", StringComparison.OrdinalIgnoreCase))
exts.Add(".gdb");
}
}
catch
{
// ignore malformed names
}
}
Log.Debug("Archive contains " + exts.Count + " distinct extensions / markers: " + string.Join(", ", exts));
string outerExt = string.Empty;
try
{
if (!string.IsNullOrWhiteSpace(outerPath))
outerExt = Path.GetExtension(outerPath) ?? string.Empty;
}
catch { /* ignore */ }
bool kmzGuardPassed = string.Equals(outerExt, ".kmz", StringComparison.OrdinalIgnoreCase) || hasTopLevelDocKml;
// JSON voting
if (exts.Contains(".json"))
{
try
{
var votes = new Dictionary(StringComparer.OrdinalIgnoreCase);
using (var arc = ArchiveFactory.Open(outerPath))
{
foreach (var entry in arc.Entries.Where(en => !en.IsDirectory))
{
var entryName = Path.GetFileName(entry.Key ?? string.Empty);
if (string.IsNullOrEmpty(entryName)) continue;
if (!entryName.EndsWith(".json", StringComparison.OrdinalIgnoreCase)) continue;
try
{
var head = ReadEntryHeadUtf8(entry, HeaderReadLimit);
var fmt = ClassifyJsonHeader(head);
switch (fmt)
{
case JsonFormatDetector.Format.TopoJson:
votes.TryGetValue("TopoJson", out var t); votes["TopoJson"] = t + 1;
break;
case JsonFormatDetector.Format.EsriJson:
votes.TryGetValue("EsriJson", out var e); votes["EsriJson"] = e + 1;
break;
case JsonFormatDetector.Format.GeoJsonSeq:
votes.TryGetValue("GeoJsonSeq", out var s); votes["GeoJsonSeq"] = s + 1;
break;
case JsonFormatDetector.Format.GeoJson:
votes.TryGetValue("GeoJson", out var g); votes["GeoJson"] = g + 1;
break;
default:
break;
}
}
catch (Exception exEntry)
{
Log.Debug("JSON entry sniffing failed for '" + entry.Key + "': " + exEntry.Message);
}
}
}
if (votes.Count > 0)
{
Log.Debug("JSON votes: " + string.Join(", ", votes.Select(kv => kv.Key + "=" + kv.Value)));
var max = votes.Values.Max();
var winners = votes.Where(kv => kv.Value == max).Select(kv => kv.Key).ToArray();
if (winners.Length == 1)
{
reason = "JSON voting majority (" + winners[0] + "=" + max + ") over entries: " + string.Join(", ", votes.Select(kv => kv.Key + "=" + kv.Value));
Log.Debug(reason);
return winners[0];
}
// friendly failure message
reason = "ambiguous JSON in archive—please specify format";
Log.Warn("Ambiguous JSON types inside archive (tie in votes): " + string.Join(", ", votes.Select(kv => kv.Key + "=" + kv.Value)));
return null;
}
}
catch (Exception ex)
{
Log.Debug("Failed to perform JSON-entry voting for archive '" + outerPath + "': " + ex.Message);
// fall through to extension heuristics
}
}
// KMZ guard
if (kmzGuardPassed)
{
try
{
if (string.Equals(outerExt, ".kmz", StringComparison.OrdinalIgnoreCase))
{
reason = "KMZ guard: outer .kmz extension";
Log.Debug(reason);
return "Kmz";
}
if (hasTopLevelDocKml)
{
reason = "KMZ guard: top-level doc.kml present";
Log.Debug(reason);
return "Kmz";
}
}
catch { /* ignore */ }
}
// strict requirement match
foreach (var kv in _s_archiveRequirements)
{
if (string.Equals(kv.Key, "Kmz", StringComparison.OrdinalIgnoreCase) && !kmzGuardPassed)
continue;
var required = kv.Value;
if (required.All(r => exts.Contains(r)))
{
reason = "Requirement match: " + kv.Key;
Log.Debug(reason);
return kv.Key;
}
}
Log.Debug("No archive-based converter match found.");
return null;
}
private static string ReadHeadUtf8(string path, int maxBytes)
{
try
{
using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
{
var toRead = (int)Math.Min(maxBytes, fs.Length);
var buffer = new byte[toRead];
var read = fs.Read(buffer, 0, toRead);
return Encoding.UTF8.GetString(buffer, 0, read);
}
}
catch (Exception ex)
{
Log.Debug("ReadHeadUtf8: failed to read head of '" + path + "': " + ex.Message);
return string.Empty;
}
}
private static string ReadEntryHeadUtf8(SharpCompress.Archives.IArchiveEntry entry, int maxBytes)
{
try
{
using (var s = entry.OpenEntryStream())
using (var ms = new MemoryStream())
{
var buffer = new byte[8192];
int remaining = maxBytes;
int read;
while (remaining > 0 && (read = s.Read(buffer, 0, Math.Min(buffer.Length, remaining))) > 0)
{
ms.Write(buffer, 0, read);
remaining -= read;
}
return Encoding.UTF8.GetString(ms.ToArray());
}
}
catch (Exception ex)
{
Log.Debug("ReadEntryHeadUtf8: failed to read entry '" + (entry?.Key ?? "") + "': " + ex.Message);
return string.Empty;
}
}
private static bool LooksLikeNdjson(string text, int threshold = NdjsonThreshold)
{
if (string.IsNullOrWhiteSpace(text)) return false;
int count = 0;
using (var sr = new StringReader(text))
{
string line;
while ((line = sr.ReadLine()) != null)
{
line = line.Trim();
if (line.Length == 0) continue;
if (line.StartsWith("{") || line.StartsWith("["))
{
if (++count >= threshold) return true;
}
else
{
break;
}
}
}
return false;
}
private static JsonFormatDetector.Format ClassifyJsonHeader(string head)
{
if (string.IsNullOrWhiteSpace(head)) return JsonFormatDetector.Format.Unknown;
if (head.IndexOf("\"type\"", StringComparison.OrdinalIgnoreCase) >= 0 &&
head.IndexOf("\"topology\"", StringComparison.OrdinalIgnoreCase) >= 0)
return JsonFormatDetector.Format.TopoJson;
if (head.IndexOf("\"spatialReference\"", StringComparison.OrdinalIgnoreCase) >= 0 ||
head.IndexOf("\"geometryType\"", StringComparison.OrdinalIgnoreCase) >= 0 ||
head.IndexOf("\"attributes\"", StringComparison.OrdinalIgnoreCase) >= 0)
return JsonFormatDetector.Format.EsriJson;
if (LooksLikeNdjson(head, NdjsonThreshold))
return JsonFormatDetector.Format.GeoJsonSeq;
if (head.IndexOf("\"FeatureCollection\"", StringComparison.OrdinalIgnoreCase) >= 0 ||
head.IndexOf("\"Feature\"", StringComparison.OrdinalIgnoreCase) >= 0 ||
head.IndexOf("\"coordinates\"", StringComparison.OrdinalIgnoreCase) >= 0)
return JsonFormatDetector.Format.GeoJson;
return JsonFormatDetector.Format.Unknown;
}
}
}