using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using GitConverter.Lib.Factories;
using GitConverter.Lib.Logging;
using GitConverter.Lib.Models;
using SharpCompress.Archives;
namespace GitConverter.Lib.Converters
{
/// <summary>
/// High-level conversion orchestrator.
/// </summary>
/// <remarks>
/// Responsibilities:
/// - Validate paths (input/output exists, temp can be created).
/// - Inspect input: single file or archive.
/// - Detect the best matching converter based on input extension or archive contents (required file extensions).
/// - Resolve converter using ConverterFactory (TryCreate) and invoke its Convert method.
/// - Log each step and return a friendly ConversionResult on expected failures (validation, unknown option,
///   missing required files) rather than throwing.
///
/// Archive and JSON detection notes:
/// - Archive detection is primarily extension/entry-driven (shp/shx/dbf -> Shapefile).
/// - JSON inside archives is ambiguous (GeoJson / EsriJson / TopoJson / GeoJsonSeq). To reduce false positives:
///   - When an archive contains .json entries we perform a lightweight header read (at most 64KB per JSON entry),
///     classify each candidate JSON entry, then vote across entries and pick the majority format.
///   - A tie results in an ambiguous outcome and the orchestrator returns a friendly failure to prompt
///     the caller to specify an explicit converter option.
/// - KMZ is a zipped KML and is often recognized by a top-level "doc.kml". To avoid false positives where
///   a generic ZIP simply contains a nested .kml, prefer "Kmz" only when the outer filename is .kmz OR
///   when a top-level doc.kml exists.
/// - NDJSON/GeoJsonSeq detection requires at least two JSON-looking lines at file head to avoid
///   misclassifying single-object GeoJSON as NDJSON.
/// </remarks>
public static class ConversionService
{
/// <summary>
/// Required file-extension markers that must ALL be present among an archive's entries for the
/// keyed converter to be considered a match (consumed by DetectConverterFromArchiveEntries).
/// Keys are converter names as resolved by the ConverterFactory; lookup is case-insensitive.
/// </summary>
private static readonly Dictionary<string, string[]> _s_archiveRequirements = new Dictionary<string, string[]>(StringComparer.OrdinalIgnoreCase)
{
    { "EsriJson", new[] { ".json", ".esrijson" } },
    { "GeoJson", new[] { ".geojson", ".json" } },
    { "GeoJsonSeq", new[] { ".json" } },
    { "Kml", new[] { ".kml" } },
    { "Kmz", new[] { ".kml" } },
    { "Shapefile", new[] { ".shp", ".shx", ".dbf" } },
    { "Osm", new[] { ".osm" } },
    { "Gdb", new[] { ".gdb" } },
    { "Gpx", new[] { ".gpx" } },
    { "TopoJson", new[] { ".json" } },
    { "MapInfoInterchange", new[] { ".mif" } },
    { "MapInfoTab", new[] { ".tab", ".dat", ".map", ".id" } },
    { "Csv", new[] { ".csv" } },
    { "GeoPackage", new[] { ".gpkg" } },
};
/// <summary>
/// Maps a single-file input extension to its converter key (case-insensitive).
/// Used for non-JSON inputs; JSON-family extensions are handled by content sniffing in Run.
/// </summary>
private static readonly Dictionary<string, string> _s_extensionToConverter = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
{
    { ".geojson", "GeoJson" },
    { ".esrijson", "EsriJson" },
    { ".kml", "Kml" },
    { ".kmz", "Kmz" },
    { ".shp", "Shapefile" },
    { ".osm", "Osm" },
    { ".gpx", "Gpx" },
    { ".gml", "Gml" },
    { ".gdb", "Gdb" },
    { ".mif", "MapInfoInterchange" },
    { ".tab", "MapInfoTab" },
    { ".map", "MapInfoTab" },
    { ".dat", "MapInfoTab" },
    { ".id", "MapInfoTab" },
    { ".csv", "Csv" },
    { ".gpkg", "GeoPackage" },
};
/// <summary>
/// Orchestrate a conversion given paths and a factory.
/// Note: outputFolderPath is expected to be a folder path (not a file path).
/// </summary>
/// <param name="gisInputFilePath">GIS input path: a single file or an archive on disk.</param>
/// <param name="outputFolderPath">Destination FOLDER for converted output; paths carrying an extension are rejected.</param>
/// <param name="tempFolderPath">Scratch folder; validated/prepared via ConverterUtils.ValidateAndPreparePaths.</param>
/// <param name="factory">Optional converter factory; when null a default ConverterFactory is created.</param>
/// <returns>
/// The ConversionResult from the resolved converter, or a friendly Failure for expected problems
/// (bad paths, unknown or ambiguous format, unresolved converter). Unexpected exceptions are caught
/// and surfaced as a Failure as well — this method is not expected to throw.
/// </returns>
public static ConversionResult Run(string gisInputFilePath, string outputFolderPath, string tempFolderPath, IConverterFactory factory = null)
{
try
{
Log.Info("ConversionService: Run invoked.");
// Require an output FOLDER path (tests and callers expect folder semantics).
if (string.IsNullOrWhiteSpace(outputFolderPath))
{
Log.Error("ConversionService: output folder path is required.");
return ConversionResult.Failure("Output folder path is required.");
}
// Reject file-like paths: caller must provide a folder, not a file with extension.
if (Path.HasExtension(outputFolderPath))
{
Log.Error($"ConversionService: output path '{outputFolderPath}' appears to be a file. Provide a folder path instead.");
return ConversionResult.Failure("Output path must be a folder path (no filename/extension).");
}
var outFolderForValidation = outputFolderPath;
// Validate inputs and prepare folders (ensure output folder writable and temp ready)
var prep = ConverterUtils.ValidateAndPreparePaths(gisInputFilePath, outFolderForValidation, tempFolderPath);
if (prep != null) return prep; // validation failure
// Lazily default the factory so callers (and tests) can inject their own.
if (factory == null)
{
factory = new ConverterFactory();
}
// Determine input kind
if (ConverterUtils.IsArchiveFile(gisInputFilePath))
{
// --- Archive input: detect converter from the entries inside the archive ---
Log.Info($"Input '{gisInputFilePath}' detected as archive. Inspecting contents.");
var entries = ConverterUtils.TryListArchiveEntries(gisInputFilePath);
if (entries == null)
{
Log.Error("Failed to list archive entries.");
return ConversionResult.Failure("Failed to inspect archive contents.");
}
var matchedConverter = DetectConverterFromArchiveEntries(entries, gisInputFilePath, out string detectReason);
if (string.IsNullOrEmpty(matchedConverter))
{
// Null converter means either no rule matched, or JSON voting ended in a tie (ambiguous).
Log.Warn("No converter matched archive contents (or match ambiguous).");
if (!string.IsNullOrEmpty(detectReason))
Log.Info($"Archive detection reason: {detectReason}");
return ConversionResult.Failure("No converter matched archive contents or required files are missing or ambiguous.");
}
Log.Info($"Archive matched converter '{matchedConverter}'. Reason: {detectReason}");
if (!factory.TryCreate(matchedConverter, out var conv))
{
Log.Error($"ConverterFactory failed to resolve converter '{matchedConverter}'.");
return ConversionResult.Failure($"Converter for '{matchedConverter}' is not available.");
}
Log.Info($"Converter '{matchedConverter}' resolved. Invoking Convert(...).");
// converters expect an output folder path; pass the folder
return conv.Convert(gisInputFilePath, matchedConverter, outputFolderPath, tempFolderPath);
}
else
{
// --- Single-file input: map by extension; JSON-family needs content sniffing ---
var ext = Path.GetExtension(gisInputFilePath);
Log.Info($"Input '{gisInputFilePath}' detected as single file with extension '{ext}'.");
// EndsWith("json") intentionally catches .json, .geojson, .esrijson, .topojson, etc.
if (!string.IsNullOrWhiteSpace(ext) && ext.EndsWith("json", StringComparison.OrdinalIgnoreCase))
{
// JSON detection: prefer JsonFormatDetector.DetectFromFile.
JsonFormatDetector.Format jsonFmt = JsonFormatDetector.Format.Unknown;
string reason = null;
try
{
jsonFmt = JsonFormatDetector.DetectFromFile(gisInputFilePath);
if (jsonFmt != JsonFormatDetector.Format.Unknown)
reason = "JsonFormatDetector.DetectFromFile";
}
catch (Exception detEx)
{
// Detector failure is non-fatal: fall back to the local header sniff below.
Log.Debug($"JsonFormatDetector.DetectFromFile threw: {detEx.Message}. Will attempt lightweight header sniff.");
jsonFmt = JsonFormatDetector.Format.Unknown;
}
if (jsonFmt == JsonFormatDetector.Format.Unknown)
{
// Read a bounded head of the file (e.g., 64KB) to avoid loading huge files
var head = ReadHeadUtf8(gisInputFilePath, maxBytes: 64 * 1024);
jsonFmt = ClassifyJsonHeader(head);
// Derive a reason string for traceability
if (jsonFmt == JsonFormatDetector.Format.GeoJsonSeq)
reason = "Header sniff: NDJSON heuristic (>=2 JSON lines)";
else if (jsonFmt == JsonFormatDetector.Format.TopoJson)
reason = "Header sniff: TopoJSON fingerprint";
else if (jsonFmt == JsonFormatDetector.Format.EsriJson)
reason = "Header sniff: EsriJSON fingerprint";
else if (jsonFmt == JsonFormatDetector.Format.GeoJson)
reason = "Header sniff: GeoJSON fingerprint (Feature/coordinates/FeatureCollection)";
else
reason = "Header sniff: unknown";
}
if (jsonFmt == JsonFormatDetector.Format.Unknown)
{
Log.Error("Unable to parse JSON input to determine specific JSON GIS format.");
return ConversionResult.Failure("Unable to determine JSON format (GeoJson / EsriJson / GeoJsonSeq / TopoJson).");
}
// Map detected format enum to the factory's converter key.
string converterKeyForJson = null;
switch (jsonFmt)
{
case JsonFormatDetector.Format.GeoJson:
converterKeyForJson = "GeoJson";
break;
case JsonFormatDetector.Format.EsriJson:
converterKeyForJson = "EsriJson";
break;
case JsonFormatDetector.Format.GeoJsonSeq:
converterKeyForJson = "GeoJsonSeq";
break;
case JsonFormatDetector.Format.TopoJson:
converterKeyForJson = "TopoJson";
break;
default:
converterKeyForJson = null;
break;
}
if (string.IsNullOrWhiteSpace(converterKeyForJson))
{
// Defensive: only reachable if the Format enum gains members unknown to the switch above.
Log.Error("Failed to map detected JSON format to a converter key.");
return ConversionResult.Failure("Failed to map JSON format to converter.");
}
Log.Info($"Detected JSON format '{jsonFmt}' (reason: {reason}). Resolving converter '{converterKeyForJson}'.");
if (!factory.TryCreate(converterKeyForJson, out var convJson))
{
Log.Error($"ConverterFactory failed to resolve converter '{converterKeyForJson}'.");
return ConversionResult.Failure($"Converter for '{converterKeyForJson}' is not available.");
}
return convJson.Convert(gisInputFilePath, converterKeyForJson, outputFolderPath, tempFolderPath);
}
// Non-JSON single file: straight extension-to-converter lookup.
if (!_s_extensionToConverter.TryGetValue(ext, out var converterKeyNonJson))
{
Log.Warn($"No converter mapping for extension '{ext}'.");
return ConversionResult.Failure($"Unknown input file type '{ext}'.");
}
Log.Info($"Mapped extension '{ext}' to converter '{converterKeyNonJson}' (reason: extension mapping). Attempting to resolve.");
if (!factory.TryCreate(converterKeyNonJson, out var convNonJson))
{
Log.Error($"ConverterFactory failed to resolve converter '{converterKeyNonJson}'.");
return ConversionResult.Failure($"Converter for '{converterKeyNonJson}' is not available.");
}
Log.Info($"Converter '{converterKeyNonJson}' resolved. Invoking Convert(...).");
return convNonJson.Convert(gisInputFilePath, converterKeyNonJson, outputFolderPath, tempFolderPath);
}
}
catch (Exception ex)
{
// Last-resort safety net: convert any unexpected exception into a friendly failure result.
Log.Error($"Unexpected error in ConversionService.Run: {ex.Message}", ex);
return ConversionResult.Failure($"Unexpected error: {ex.Message}");
}
}
/// <summary>
/// Detect a converter key from the archive entries.
/// </summary>
/// <param name="entries">Archive entry names (file paths inside archive).</param>
/// <param name="outerPath">The archive path on disk (used for outer extension guard, e.g. .kmz, and to reopen for JSON sniffing).</param>
/// <param name="reason">Outputs a diagnostic reason for the selection (for logging).</param>
/// <returns>The matched converter key, or null when nothing matched or JSON detection was ambiguous.</returns>
/// <remarks>
/// Detection strategy:
/// - Collect a set of discovered extensions and folder markers (e.g. .gdb from path segments).
/// - If .json entries are present, perform lightweight header reads and a voting mechanism across JSON entries.
///   The majority vote selects the converter. Ties are treated as ambiguous (returns null).
/// - Heuristic: detect top-level "doc.kml" which is the KMZ convention; prefer Kmz only when the outer
///   archive has a .kmz extension OR when doc.kml exists at top-level (avoids false positives for generic ZIPs).
/// - Match the discovered extensions against _s_archiveRequirements; the first requirement entry
///   where all required markers are present is selected.
/// - Returns null when no rule matches or when JSON-based detection is ambiguous.
/// </remarks>
private static string DetectConverterFromArchiveEntries(IEnumerable<string> entries, string outerPath, out string reason)
{
    reason = null;
    var exts = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    bool hasTopLevelDocKml = false;
    foreach (var e in entries ?? Enumerable.Empty<string>())
    {
        try
        {
            if (string.IsNullOrWhiteSpace(e)) continue;
            // Normal extension from the entry path (file or dir entry)
            var ext = Path.GetExtension(e);
            if (!string.IsNullOrEmpty(ext))
                exts.Add(ext.ToLowerInvariant());
            // Normalize to forward slashes for segment operations
            var normalized = e.Replace('\\', '/').Trim('/');
            // top-level doc.kml heuristic (KMZ convention)
            if (string.Equals(normalized, "doc.kml", StringComparison.OrdinalIgnoreCase))
                hasTopLevelDocKml = true;
            // Inspect path segments for folder markers ending with known suffixes (e.g., .gdb)
            var segments = normalized.Split(new[] { '/' }, StringSplitOptions.RemoveEmptyEntries);
            foreach (var seg in segments)
            {
                var idx = seg.LastIndexOf('.');
                if (idx > 0 && idx < seg.Length - 1)
                {
                    exts.Add(seg.Substring(idx).ToLowerInvariant());
                }
                // explicit .gdb folder marker (additional safety)
                if (seg.EndsWith(".gdb", StringComparison.OrdinalIgnoreCase))
                    exts.Add(".gdb");
            }
        }
        catch
        {
            // ignore malformed names
        }
    }
    Log.Debug($"Archive contains {exts.Count} distinct extensions / markers: {string.Join(", ", exts)}");
    // If there are JSON entries, perform JSON entry voting to disambiguate among JSON formats.
    if (exts.Contains(".json"))
    {
        try
        {
            var votes = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
            using (var arc = ArchiveFactory.Open(outerPath))
            {
                foreach (var entry in arc.Entries.Where(e => !e.IsDirectory))
                {
                    // use filename to check extension (robust to path segments inside archive)
                    var entryName = Path.GetFileName(entry.Key ?? string.Empty);
                    if (string.IsNullOrEmpty(entryName)) continue;
                    if (!entryName.EndsWith(".json", StringComparison.OrdinalIgnoreCase)) continue;
                    try
                    {
                        var head = ReadEntryHeadUtf8(entry, maxBytes: 64 * 1024);
                        var fmt = ClassifyJsonHeader(head);
                        // Map the sniffed format onto a converter key; Unknown casts no vote.
                        string voteKey = null;
                        switch (fmt)
                        {
                            case JsonFormatDetector.Format.TopoJson: voteKey = "TopoJson"; break;
                            case JsonFormatDetector.Format.EsriJson: voteKey = "EsriJson"; break;
                            case JsonFormatDetector.Format.GeoJsonSeq: voteKey = "GeoJsonSeq"; break;
                            case JsonFormatDetector.Format.GeoJson: voteKey = "GeoJson"; break;
                        }
                        if (voteKey != null)
                        {
                            votes.TryGetValue(voteKey, out var current);
                            votes[voteKey] = current + 1;
                        }
                    }
                    catch (Exception exEntry)
                    {
                        // Per-entry sniff failures only lose that entry's vote.
                        Log.Debug($"JSON entry sniffing failed for '{entry.Key}': {exEntry.Message}");
                    }
                }
            }
            if (votes.Count > 0)
            {
                Log.Debug($"JSON votes: {string.Join(", ", votes.Select(kv => kv.Key + "=" + kv.Value))}");
                var max = votes.Values.Max();
                var winners = votes.Where(kv => kv.Value == max).Select(kv => kv.Key).ToArray();
                if (winners.Length == 1)
                {
                    reason = $"JSON voting majority ({winners[0]}={max}) over entries: {string.Join(", ", votes.Select(kv => kv.Key + "=" + kv.Value))}";
                    Log.Debug($"JSON majority selected '{winners[0]}'. Reason: {reason}");
                    return winners[0];
                }
                reason = $"JSON voting tie across entries: {string.Join(", ", votes.Select(kv => kv.Key + "=" + kv.Value))}";
                Log.Warn("Ambiguous JSON types inside archive (tie in votes); failing with friendly message.");
                return null; // ambiguous
            }
            // else fall through to strict extension-based matching
        }
        catch (Exception ex)
        {
            Log.Debug($"Failed to perform JSON-entry voting for archive '{outerPath}': {ex.Message}");
            // fall through to the other detection heuristics
        }
    }
    // KMZ guard: prefer real KMZ only when outer is .kmz or top-level doc.kml exists.
    try
    {
        if (!string.IsNullOrWhiteSpace(outerPath))
        {
            var outerExt = Path.GetExtension(outerPath) ?? string.Empty;
            if (string.Equals(outerExt, ".kmz", StringComparison.OrdinalIgnoreCase))
            {
                reason = "KMZ guard: outer .kmz extension";
                Log.Debug(reason);
                return "Kmz";
            }
            if (hasTopLevelDocKml)
            {
                reason = "KMZ guard: top-level doc.kml present";
                Log.Debug(reason);
                return "Kmz";
            }
        }
    }
    catch
    {
        // ignore any path parsing issues and continue to strict matching
    }
    // strict requirement match (first matching rule wins)
    foreach (var kv in _s_archiveRequirements)
    {
        var required = kv.Value;
        var allPresent = required.All(r => exts.Contains(r));
        if (allPresent)
        {
            reason = $"Requirement match: {kv.Key}";
            Log.Debug(reason);
            return kv.Key;
        }
    }
    Log.Debug("No archive-based converter match found.");
    return null;
}
/// <summary>
/// Read up to <paramref name="maxBytes"/> bytes from the start of the file and decode as UTF8.
/// This avoids loading very large inputs entirely when we only need a header snippet for format detection.
/// Returns an empty string on any read failure (best-effort sniffing, never throws).
/// </summary>
private static string ReadHeadUtf8(string path, int maxBytes = 64 * 1024)
{
    try
    {
        using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
        {
            var toRead = (int)Math.Min(maxBytes, fs.Length);
            var buffer = new byte[toRead];
            // Stream.Read may return fewer bytes than requested; loop until the
            // requested head is filled or the stream ends.
            int total = 0;
            while (total < toRead)
            {
                var read = fs.Read(buffer, total, toRead - total);
                if (read <= 0) break;
                total += read;
            }
            return Encoding.UTF8.GetString(buffer, 0, total);
        }
    }
    catch (Exception ex)
    {
        Log.Debug($"ReadHeadUtf8: failed to read head of '{path}': {ex.Message}");
        return string.Empty;
    }
}
/// <summary>
/// Read up to <paramref name="maxBytes"/> bytes from the start of an archive entry stream and decode as UTF8.
/// Returns an empty string when the entry cannot be opened or read (best-effort sniffing, never throws).
/// </summary>
private static string ReadEntryHeadUtf8(SharpCompress.Archives.IArchiveEntry entry, int maxBytes = 64 * 1024)
{
    try
    {
        using (var source = entry.OpenEntryStream())
        {
            var collected = new MemoryStream();
            var chunk = new byte[8192];
            // Pull bounded chunks until the byte budget is spent or the stream ends.
            var budget = maxBytes;
            while (budget > 0)
            {
                var got = source.Read(chunk, 0, Math.Min(chunk.Length, budget));
                if (got <= 0) break;
                collected.Write(chunk, 0, got);
                budget -= got;
            }
            return Encoding.UTF8.GetString(collected.ToArray());
        }
    }
    catch (Exception ex)
    {
        Log.Debug($"ReadEntryHeadUtf8: failed to read entry '{entry?.Key}': {ex.Message}");
        return string.Empty;
    }
}
/// <summary>
/// Heuristic: count JSON-looking non-empty lines at file head.
/// Returns true when at least <paramref name="threshold"/> lines start with '{' or '['.
/// Stops early if a non-JSON-looking non-empty line is encountered.
/// </summary>
/// <remarks>
/// Rationale:
/// - NDJSON (GeoJSONSeq) should contain multiple JSON objects separated by newlines.
/// - Single-file GeoJSON that happens to start with '{' must not be misclassified as NDJSON.
/// - Using a threshold (default 2) reduces false positives: we require at least two JSON-like lines.
/// </remarks>
private static bool LooksLikeNdjson(string text, int threshold = 2)
{
    if (string.IsNullOrWhiteSpace(text)) return false;
    var jsonLineCount = 0;
    using (var reader = new StringReader(text))
    {
        for (var raw = reader.ReadLine(); raw != null; raw = reader.ReadLine())
        {
            var trimmed = raw.Trim();
            if (trimmed.Length == 0) continue;
            // Any non-JSON-looking line immediately disqualifies the NDJSON hypothesis.
            if (trimmed[0] != '{' && trimmed[0] != '[')
                return false;
            if (++jsonLineCount >= threshold)
                return true;
        }
    }
    return false;
}
/// <summary>
/// Classify a JSON header/snippet into a JsonFormatDetector.Format.
/// Uses fingerprints for TopoJSON and EsriJSON, requires >=2 JSON-like lines for NDJSON,
/// and treats single Feature/Geometry objects as GeoJSON when "Feature" or "coordinates" are present.
/// </summary>
private static JsonFormatDetector.Format ClassifyJsonHeader(string head)
{
    if (string.IsNullOrWhiteSpace(head)) return JsonFormatDetector.Format.Unknown;
    // Case-insensitive containment test over the bounded header snippet.
    bool Has(string token) => head.IndexOf(token, StringComparison.OrdinalIgnoreCase) >= 0;
    // Order matters: most specific fingerprints are checked first.
    // TopoJSON fingerprint
    if (Has("\"type\"") && Has("\"topology\""))
        return JsonFormatDetector.Format.TopoJson;
    // EsriJSON heuristics: spatialReference / geometryType / attributes typical keys
    if (Has("\"spatialReference\"") || Has("\"geometryType\"") || Has("\"attributes\""))
        return JsonFormatDetector.Format.EsriJson;
    // NDJSON / GeoJsonSeq: require at least 2 JSON-looking lines at head
    if (LooksLikeNdjson(head, threshold: 2))
        return JsonFormatDetector.Format.GeoJsonSeq;
    // GeoJSON: FeatureCollection/Feature or geometry w/ coordinates or single-feature indications
    if (Has("\"FeatureCollection\"") || Has("\"Feature\"") || Has("\"coordinates\""))
        return JsonFormatDetector.Format.GeoJson;
    return JsonFormatDetector.Format.Unknown;
}
}
}