using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using GitConverter.Lib.Factories;
using GitConverter.Lib.Logging;
using GitConverter.Lib.Models;
using SharpCompress.Archives;

namespace GitConverter.Lib.Converters
{
    /// <summary>
    /// High-level conversion orchestrator: validates input/output/temp paths,
    /// determines the input kind (single file vs archive), selects a converter
    /// key, and dispatches to a converter resolved from an
    /// <see cref="IConverterFactory"/>.
    /// </summary>
    /// <remarks>
    /// Detection and selection summary:
    /// <list type="bullet">
    /// <item>Single-file inputs: explicit extension-to-converter mapping for
    /// known extensions (e.g. .geojson -> "GeoJson", .topojson -> "TopoJson",
    /// .esrijson -> "EsriJson"). For generic .json files a bounded head read
    /// (<see cref="HeaderReadLimit"/>) is classified via
    /// <see cref="ClassifyJsonHeader"/>. NDJSON / GeoJsonSeq is only selected
    /// when the head contains at least <see cref="NdjsonThreshold"/> complete
    /// JSON lines, to avoid misclassifying single GeoJSON objects.</item>
    /// <item>Archive inputs: entries are inspected for extension / marker
    /// information; .json entries are classified and majority-voted. A single
    /// majority selects the converter; a tie returns a friendly failure whose
    /// reason contains "ambiguous JSON in archive". "Kmz" is preferred only
    /// when the outer filename extension is .kmz OR a top-level doc.kml entry
    /// exists. Otherwise the first requirement rule whose markers are all
    /// present wins (ordered, deterministic).</item>
    /// </list>
    /// Safety: all head reads are bounded to <see cref="HeaderReadLimit"/>
    /// bytes; archive entry reads are streamed. Error handling: expected
    /// problems return friendly <c>ConversionResult</c> failures; unexpected
    /// exceptions are caught and returned as a failure result rather than
    /// thrown. Tests assert on stable, case-insensitive substrings of the
    /// detection reasons and failure messages, so those strings must remain
    /// stable.
    /// </remarks>
    public static class ConversionService
    {
        // Minimum number of complete JSON lines in the head before a generic
        // .json file is treated as NDJSON / GeoJsonSeq.
        private const int NdjsonThreshold = 2;

        // Upper bound for every head sniff (single files and archive entries).
        private const int HeaderReadLimit = 64 * 1024; // 64 KB

        // Ordered requirement rules for archive detection. This is an ARRAY,
        // not a Dictionary: the documented contract is "first match wins",
        // which requires a guaranteed iteration order.
        // NOTE(review): the "GeoJson" rule requires BOTH .geojson and .json to
        // be present in the archive — confirm that is intentional.
        private static readonly KeyValuePair<string, string[]>[] _s_archiveRequirements =
        {
            new KeyValuePair<string, string[]>("EsriJson", new[] { ".json", ".esrijson" }),
            new KeyValuePair<string, string[]>("GeoJson", new[] { ".geojson", ".json" }),
            new KeyValuePair<string, string[]>("GeoJsonSeq", new[] { ".json" }),
            new KeyValuePair<string, string[]>("Kml", new[] { ".kml" }),
            new KeyValuePair<string, string[]>("Kmz", new[] { ".kml" }),
            new KeyValuePair<string, string[]>("Shapefile", new[] { ".shp", ".shx", ".dbf" }),
            new KeyValuePair<string, string[]>("Osm", new[] { ".osm" }),
            new KeyValuePair<string, string[]>("Gdb", new[] { ".gdb" }),
            new KeyValuePair<string, string[]>("Gpx", new[] { ".gpx" }),
            new KeyValuePair<string, string[]>("TopoJson", new[] { ".json" }),
            new KeyValuePair<string, string[]>("MapInfoInterchange", new[] { ".mif" }),
            new KeyValuePair<string, string[]>("MapInfoTab", new[] { ".tab", ".dat", ".map", ".id" }),
            new KeyValuePair<string, string[]>("Csv", new[] { ".csv" }),
            new KeyValuePair<string, string[]>("GeoPackage", new[] { ".gpkg" }),
        };

        // Single-file extension routing table (case-insensitive).
        private static readonly Dictionary<string, string> _s_extensionToConverter =
            new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
            {
                { ".geojson", "GeoJson" },
                { ".topojson", "TopoJson" },
                { ".esrijson", "EsriJson" },
                { ".kml", "Kml" },
                { ".kmz", "Kmz" },
                { ".shp", "Shapefile" },
                { ".osm", "Osm" },
                { ".gpx", "Gpx" },
                { ".gml", "Gml" },
                { ".gdb", "Gdb" },
                { ".mif", "MapInfoInterchange" },
                { ".tab", "MapInfoTab" },
                { ".map", "MapInfoTab" },
                { ".dat", "MapInfoTab" },
                { ".id", "MapInfoTab" },
                { ".csv", "Csv" },
                { ".gpkg", "GeoPackage" },
            };

        /// <summary>
        /// Orchestrate a conversion given paths and a factory.
        /// Note: <paramref name="outputFolderPath"/> is expected to be a folder
        /// path (not a file path).
        /// </summary>
        /// <param name="gisInputFilePath">Path to the input file or archive.</param>
        /// <param name="outputFolderPath">Destination folder (must have no filename/extension).</param>
        /// <param name="tempFolderPath">Scratch folder prepared for conversion.</param>
        /// <param name="factory">Optional converter factory; a <see cref="ConverterFactory"/> is created when null.</param>
        /// <returns>A success or friendly failure <c>ConversionResult</c>; never throws for expected errors.</returns>
        public static ConversionResult Run(string gisInputFilePath, string outputFolderPath, string tempFolderPath, IConverterFactory factory = null)
        {
            try
            {
                Log.Info("ConversionService: Run invoked.");

                if (string.IsNullOrWhiteSpace(outputFolderPath))
                {
                    Log.Error("ConversionService: output folder path is required.");
                    return ConversionResult.Failure("Output folder path is required.");
                }

                // Reject file-like paths: caller must provide a folder, not a file with extension.
                if (Path.HasExtension(outputFolderPath))
                {
                    Log.Error($"ConversionService: output path '{outputFolderPath}' appears to be a file. Provide a folder path instead.");
                    return ConversionResult.Failure("Output path must be a folder path (no filename/extension).");
                }

                // Validate inputs and prepare folders (ensure output folder writable and temp ready).
                var prep = ConverterUtils.ValidateAndPreparePaths(gisInputFilePath, outputFolderPath, tempFolderPath);
                if (prep != null)
                {
                    return prep; // validation failure
                }

                if (factory == null)
                {
                    factory = new ConverterFactory();
                }

                if (ConverterUtils.IsArchiveFile(gisInputFilePath))
                {
                    return ConvertArchive(gisInputFilePath, outputFolderPath, tempFolderPath, factory);
                }

                return ConvertSingleFile(gisInputFilePath, outputFolderPath, tempFolderPath, factory);
            }
            catch (Exception ex)
            {
                Log.Error("Unexpected error in ConversionService.Run: " + ex.Message, ex);
                return ConversionResult.Failure("Unexpected error: " + ex.Message);
            }
        }

        /// <summary>
        /// Handle archive inputs: inspect entries, detect a converter key, and
        /// dispatch. On success the detection reason is appended to the result
        /// message for traceability.
        /// </summary>
        private static ConversionResult ConvertArchive(string gisInputFilePath, string outputFolderPath, string tempFolderPath, IConverterFactory factory)
        {
            Log.Info($"Input '{gisInputFilePath}' detected as archive. Inspecting contents.");

            var entries = ConverterUtils.TryListArchiveEntries(gisInputFilePath);
            if (entries == null)
            {
                Log.Error("Failed to list archive entries.");
                return ConversionResult.Failure("Failed to inspect archive contents.");
            }

            var matchedConverter = DetectConverterFromArchiveEntries(entries, gisInputFilePath, out string detectReason);
            if (string.IsNullOrEmpty(matchedConverter))
            {
                Log.Warn("No converter matched archive contents (or match ambiguous).");
                if (!string.IsNullOrEmpty(detectReason))
                {
                    // The detector produced a friendly reason (e.g. ambiguous JSON tie);
                    // surface it directly to the caller.
                    Log.Info($"Archive detection reason: {detectReason}");
                    return ConversionResult.Failure(detectReason);
                }
                return ConversionResult.Failure("No converter matched archive contents or required files are missing or ambiguous.");
            }

            Log.Info($"Archive matched converter '{matchedConverter}'. Reason: {detectReason}");

            if (!factory.TryCreate(matchedConverter, out var conv))
            {
                Log.Error($"ConverterFactory failed to resolve converter '{matchedConverter}'.");
                return ConversionResult.Failure($"Converter for '{matchedConverter}' is not available.");
            }

            Log.Info($"Converter '{matchedConverter}' resolved. Invoking Convert(...). Detector reason: {detectReason}");
            var convResult = conv.Convert(gisInputFilePath, matchedConverter, outputFolderPath, tempFolderPath);

            // Bubble the detection reason into the result message for traceability on success.
            if (convResult != null && convResult.IsSuccess && !string.IsNullOrWhiteSpace(detectReason))
            {
                var msg = string.IsNullOrWhiteSpace(convResult.Message)
                    ? $"detector reason: {detectReason}"
                    : $"{convResult.Message} (detector reason: {detectReason})";
                Log.Info($"Conversion succeeded for '{matchedConverter}'. {msg}");
                return ConversionResult.Success(msg);
            }

            return convResult ?? ConversionResult.Failure("Converter returned null result.");
        }

        /// <summary>
        /// Handle single-file inputs: explicit json-family extensions route
        /// directly; generic *json files go through content sniffing; all other
        /// extensions use the mapping table.
        /// </summary>
        private static ConversionResult ConvertSingleFile(string gisInputFilePath, string outputFolderPath, string tempFolderPath, IConverterFactory factory)
        {
            var ext = (Path.GetExtension(gisInputFilePath) ?? string.Empty).ToLowerInvariant();
            Log.Info($"Input '{gisInputFilePath}' detected as single file with extension '{ext}'.");

            // Direct extension routing for explicit JSON-type extensions (no NDJSON sniff).
            if (_s_extensionToConverter.TryGetValue(ext, out var directConverter)
                && (ext == ".geojson" || ext == ".topojson" || ext == ".esrijson"))
            {
                Log.Info($"Mapped extension '{ext}' to converter '{directConverter}' (reason: explicit extension mapping). Attempting to resolve.");
                if (!factory.TryCreate(directConverter, out var convDirect))
                {
                    Log.Error($"ConverterFactory failed to resolve converter '{directConverter}'.");
                    return ConversionResult.Failure($"Converter for '{directConverter}' is not available.");
                }
                return convDirect.Convert(gisInputFilePath, directConverter, outputFolderPath, tempFolderPath);
            }

            // Generic *json files (e.g. .json, .ndjson) need content sniffing.
            if (!string.IsNullOrWhiteSpace(ext) && ext.EndsWith("json", StringComparison.OrdinalIgnoreCase))
            {
                return ConvertJsonFile(gisInputFilePath, outputFolderPath, tempFolderPath, factory);
            }

            // Non-json extension mapping.
            if (!_s_extensionToConverter.TryGetValue(ext, out var converterKeyNonJson))
            {
                Log.Warn($"No converter mapping for extension '{ext}'.");
                return ConversionResult.Failure($"Unknown input file type '{ext}'.");
            }

            Log.Info($"Mapped extension '{ext}' to converter '{converterKeyNonJson}' (reason: extension mapping). Attempting to resolve.");
            if (!factory.TryCreate(converterKeyNonJson, out var convNonJson))
            {
                Log.Error($"ConverterFactory failed to resolve converter '{converterKeyNonJson}'.");
                return ConversionResult.Failure($"Converter for '{converterKeyNonJson}' is not available.");
            }

            Log.Info($"Converter '{converterKeyNonJson}' resolved. Invoking Convert(...).");
            return convNonJson.Convert(gisInputFilePath, converterKeyNonJson, outputFolderPath, tempFolderPath);
        }

        /// <summary>
        /// Detect the specific JSON GIS format of a file (detector first, then
        /// a bounded header sniff), map it to a converter key, and dispatch.
        /// On success the classification reason is appended to the result
        /// message for traceability.
        /// </summary>
        private static ConversionResult ConvertJsonFile(string gisInputFilePath, string outputFolderPath, string tempFolderPath, IConverterFactory factory)
        {
            JsonFormatDetector.Format jsonFmt = JsonFormatDetector.Format.Unknown;
            string reason = null;

            try
            {
                jsonFmt = JsonFormatDetector.DetectFromFile(gisInputFilePath);
                if (jsonFmt != JsonFormatDetector.Format.Unknown)
                {
                    reason = "JsonFormatDetector.DetectFromFile";
                }
            }
            catch (Exception detEx)
            {
                // Detector failure is expected for odd inputs; fall back to the sniff.
                Log.Debug("JsonFormatDetector.DetectFromFile threw: " + detEx.Message + ". Will attempt lightweight header sniff.");
                jsonFmt = JsonFormatDetector.Format.Unknown;
            }

            if (jsonFmt == JsonFormatDetector.Format.Unknown)
            {
                var head = ReadHeadUtf8(gisInputFilePath, HeaderReadLimit);
                jsonFmt = ClassifyJsonHeader(head);
                if (jsonFmt == JsonFormatDetector.Format.GeoJsonSeq)
                    reason = $"Header sniff: NDJSON heuristic (>= {NdjsonThreshold} JSON lines)";
                else if (jsonFmt == JsonFormatDetector.Format.TopoJson)
                    reason = "Header sniff: TopoJSON fingerprint";
                else if (jsonFmt == JsonFormatDetector.Format.EsriJson)
                    reason = "Header sniff: EsriJSON fingerprint";
                else if (jsonFmt == JsonFormatDetector.Format.GeoJson)
                    reason = "Header sniff: GeoJSON fingerprint (Feature/coordinates/FeatureCollection)";
                else
                    reason = "Header sniff: unknown";
            }

            if (jsonFmt == JsonFormatDetector.Format.Unknown)
            {
                Log.Error("Unable to parse JSON input to determine specific JSON GIS format.");
                return ConversionResult.Failure("Unable to determine JSON format (GeoJson / EsriJson / GeoJsonSeq / TopoJson).");
            }

            string converterKeyForJson;
            switch (jsonFmt)
            {
                case JsonFormatDetector.Format.GeoJson: converterKeyForJson = "GeoJson"; break;
                case JsonFormatDetector.Format.EsriJson: converterKeyForJson = "EsriJson"; break;
                case JsonFormatDetector.Format.GeoJsonSeq: converterKeyForJson = "GeoJsonSeq"; break;
                case JsonFormatDetector.Format.TopoJson: converterKeyForJson = "TopoJson"; break;
                default: converterKeyForJson = null; break;
            }

            if (string.IsNullOrWhiteSpace(converterKeyForJson))
            {
                Log.Error("Failed to map detected JSON format to a converter key.");
                return ConversionResult.Failure("Failed to map JSON format to converter.");
            }

            Log.Info($"Detected JSON format '{jsonFmt}' (reason: {reason}). Resolving converter '{converterKeyForJson}'.");
            if (!factory.TryCreate(converterKeyForJson, out var convJson))
            {
                Log.Error($"ConverterFactory failed to resolve converter '{converterKeyForJson}'.");
                return ConversionResult.Failure($"Converter for '{converterKeyForJson}' is not available.");
            }

            var convJsonResult = convJson.Convert(gisInputFilePath, converterKeyForJson, outputFolderPath, tempFolderPath);
            if (convJsonResult != null && convJsonResult.IsSuccess)
            {
                var msg = string.IsNullOrWhiteSpace(convJsonResult.Message)
                    ? $"detected by json classifier: {reason}"
                    : $"{convJsonResult.Message} (detected by json classifier: {reason})";
                Log.Info($"Conversion succeeded for '{converterKeyForJson}'. {msg}");
                return ConversionResult.Success(msg);
            }

            return convJsonResult ?? ConversionResult.Failure("Converter returned null result.");
        }

        /// <summary>
        /// Detect a converter key from archive entry names. Returns null when
        /// nothing matches; on a JSON voting tie also returns null and sets
        /// <paramref name="reason"/> to a friendly "ambiguous" message.
        /// </summary>
        /// <param name="entries">Archive entry names (paths inside the archive).</param>
        /// <param name="outerPath">Path of the archive file itself (used for the KMZ guard and JSON voting).</param>
        /// <param name="reason">Human-readable detection reason, or null.</param>
        private static string DetectConverterFromArchiveEntries(IEnumerable<string> entries, string outerPath, out string reason)
        {
            reason = null;

            var exts = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
            bool hasTopLevelDocKml = false;

            foreach (var e in entries ?? Enumerable.Empty<string>())
            {
                try
                {
                    if (string.IsNullOrWhiteSpace(e)) continue;

                    var ext = Path.GetExtension(e);
                    if (!string.IsNullOrEmpty(ext)) exts.Add(ext.ToLowerInvariant());

                    var normalized = e.Replace('\\', '/').Trim('/');
                    if (string.Equals(normalized, "doc.kml", StringComparison.OrdinalIgnoreCase))
                        hasTopLevelDocKml = true;

                    // Also harvest extensions from every path segment so folder
                    // markers like "data.gdb/" are seen even when no file inside
                    // carries the extension.
                    var segments = normalized.Split(new[] { '/' }, StringSplitOptions.RemoveEmptyEntries);
                    foreach (var seg in segments)
                    {
                        var idx = seg.LastIndexOf('.');
                        if (idx > 0 && idx < seg.Length - 1)
                            exts.Add(seg.Substring(idx).ToLowerInvariant());
                        if (seg.EndsWith(".gdb", StringComparison.OrdinalIgnoreCase))
                            exts.Add(".gdb");
                    }
                }
                catch
                {
                    // ignore malformed names
                }
            }

            Log.Debug("Archive contains " + exts.Count + " distinct extensions / markers: " + string.Join(", ", exts));

            string outerExt = string.Empty;
            try
            {
                if (!string.IsNullOrWhiteSpace(outerPath))
                    outerExt = Path.GetExtension(outerPath) ?? string.Empty;
            }
            catch { /* ignore */ }

            bool kmzGuardPassed = string.Equals(outerExt, ".kmz", StringComparison.OrdinalIgnoreCase) || hasTopLevelDocKml;

            // JSON voting: classify every json-family entry head and take the majority.
            if (exts.Contains(".json"))
            {
                try
                {
                    var votes = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
                    using (var arc = ArchiveFactory.Open(outerPath))
                    {
                        foreach (var entry in arc.Entries.Where(en => !en.IsDirectory))
                        {
                            var entryName = Path.GetFileName(entry.Key ?? string.Empty);
                            if (string.IsNullOrEmpty(entryName)) continue;
                            // NOTE: ".geojson"/".topojson"/".esrijson" also end with
                            // ".json", so all json-family entries take part in the vote.
                            if (!entryName.EndsWith(".json", StringComparison.OrdinalIgnoreCase)) continue;

                            try
                            {
                                var head = ReadEntryHeadUtf8(entry, HeaderReadLimit);
                                var fmt = ClassifyJsonHeader(head);
                                string voteKey = null;
                                switch (fmt)
                                {
                                    case JsonFormatDetector.Format.TopoJson: voteKey = "TopoJson"; break;
                                    case JsonFormatDetector.Format.EsriJson: voteKey = "EsriJson"; break;
                                    case JsonFormatDetector.Format.GeoJsonSeq: voteKey = "GeoJsonSeq"; break;
                                    case JsonFormatDetector.Format.GeoJson: voteKey = "GeoJson"; break;
                                }
                                if (voteKey != null)
                                {
                                    votes.TryGetValue(voteKey, out var n);
                                    votes[voteKey] = n + 1;
                                }
                            }
                            catch (Exception exEntry)
                            {
                                Log.Debug("JSON entry sniffing failed for '" + entry.Key + "': " + exEntry.Message);
                            }
                        }
                    }

                    if (votes.Count > 0)
                    {
                        Log.Debug("JSON votes: " + string.Join(", ", votes.Select(kv => kv.Key + "=" + kv.Value)));
                        var max = votes.Values.Max();
                        var winners = votes.Where(kv => kv.Value == max).Select(kv => kv.Key).ToArray();
                        if (winners.Length == 1)
                        {
                            reason = "JSON voting majority (" + winners[0] + "=" + max + ") over entries: " + string.Join(", ", votes.Select(kv => kv.Key + "=" + kv.Value));
                            Log.Debug(reason);
                            return winners[0];
                        }

                        // Tie: surface a friendly failure message via the out reason.
                        reason = "ambiguous JSON in archive—please specify format";
                        Log.Warn("Ambiguous JSON types inside archive (tie in votes): " + string.Join(", ", votes.Select(kv => kv.Key + "=" + kv.Value)));
                        return null;
                    }
                }
                catch (Exception ex)
                {
                    Log.Debug("Failed to perform JSON-entry voting for archive '" + outerPath + "': " + ex.Message);
                    // fall through to extension heuristics
                }
            }

            // KMZ guard: only pick "Kmz" on strong evidence.
            if (kmzGuardPassed)
            {
                if (string.Equals(outerExt, ".kmz", StringComparison.OrdinalIgnoreCase))
                {
                    reason = "KMZ guard: outer .kmz extension";
                    Log.Debug(reason);
                    return "Kmz";
                }
                if (hasTopLevelDocKml)
                {
                    reason = "KMZ guard: top-level doc.kml present";
                    Log.Debug(reason);
                    return "Kmz";
                }
            }

            // Strict requirement match: first rule whose required markers are all present wins.
            foreach (var rule in _s_archiveRequirements)
            {
                if (string.Equals(rule.Key, "Kmz", StringComparison.OrdinalIgnoreCase) && !kmzGuardPassed)
                    continue;
                if (rule.Value.All(r => exts.Contains(r)))
                {
                    reason = "Requirement match: " + rule.Key;
                    Log.Debug(reason);
                    return rule.Key;
                }
            }

            Log.Debug("No archive-based converter match found.");
            return null;
        }

        /// <summary>
        /// Read up to <paramref name="maxBytes"/> from the start of a file and
        /// decode as UTF-8. Returns an empty string on any I/O failure.
        /// </summary>
        private static string ReadHeadUtf8(string path, int maxBytes)
        {
            try
            {
                using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
                {
                    var toRead = (int)Math.Min(maxBytes, fs.Length);
                    var buffer = new byte[toRead];
                    int total = 0;
                    int read;
                    // Stream.Read may return fewer bytes than requested; loop
                    // until the head is filled or EOF is reached.
                    while (total < toRead && (read = fs.Read(buffer, total, toRead - total)) > 0)
                    {
                        total += read;
                    }
                    return Encoding.UTF8.GetString(buffer, 0, total);
                }
            }
            catch (Exception ex)
            {
                Log.Debug("ReadHeadUtf8: failed to read head of '" + path + "': " + ex.Message);
                return string.Empty;
            }
        }

        /// <summary>
        /// Stream up to <paramref name="maxBytes"/> from an archive entry and
        /// decode as UTF-8. Returns an empty string on any failure.
        /// </summary>
        private static string ReadEntryHeadUtf8(SharpCompress.Archives.IArchiveEntry entry, int maxBytes)
        {
            try
            {
                using (var s = entry.OpenEntryStream())
                using (var ms = new MemoryStream())
                {
                    var buffer = new byte[8192];
                    int remaining = maxBytes;
                    int read;
                    while (remaining > 0 && (read = s.Read(buffer, 0, Math.Min(buffer.Length, remaining))) > 0)
                    {
                        ms.Write(buffer, 0, read);
                        remaining -= read;
                    }
                    return Encoding.UTF8.GetString(ms.ToArray());
                }
            }
            catch (Exception ex)
            {
                Log.Debug("ReadEntryHeadUtf8: failed to read entry '" + (entry?.Key ?? "") + "': " + ex.Message);
                return string.Empty;
            }
        }

        /// <summary>
        /// Heuristic: true when the text begins with at least
        /// <paramref name="threshold"/> non-empty lines that each form a
        /// complete JSON value (start with '{' or '[' AND end with '}' or ']'),
        /// as in NDJSON / GeoJSON text sequences where every line is one
        /// self-contained JSON document.
        /// </summary>
        private static bool LooksLikeNdjson(string text, int threshold = NdjsonThreshold)
        {
            if (string.IsNullOrWhiteSpace(text)) return false;

            int count = 0;
            using (var sr = new StringReader(text))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    line = line.Trim();
                    if (line.Length == 0) continue;
                    if (!(line.StartsWith("{") || line.StartsWith("["))) break;
                    // Require the line to also CLOSE its JSON value; otherwise a
                    // pretty-printed array ("[" alone, then "{" on the next line)
                    // would be miscounted as NDJSON. Incomplete lines (including
                    // a line truncated by the bounded head read) stop the scan.
                    if (!(line.EndsWith("}") || line.EndsWith("]"))) break;
                    if (++count >= threshold) return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Classify a bounded UTF-8 head of a JSON document by fingerprint.
        /// Order matters: TopoJSON, then EsriJSON, then NDJSON, then GeoJSON.
        /// </summary>
        private static JsonFormatDetector.Format ClassifyJsonHeader(string head)
        {
            if (string.IsNullOrWhiteSpace(head))
                return JsonFormatDetector.Format.Unknown;

            if (head.IndexOf("\"type\"", StringComparison.OrdinalIgnoreCase) >= 0 &&
                head.IndexOf("\"topology\"", StringComparison.OrdinalIgnoreCase) >= 0)
                return JsonFormatDetector.Format.TopoJson;

            if (head.IndexOf("\"spatialReference\"", StringComparison.OrdinalIgnoreCase) >= 0 ||
                head.IndexOf("\"geometryType\"", StringComparison.OrdinalIgnoreCase) >= 0 ||
                head.IndexOf("\"attributes\"", StringComparison.OrdinalIgnoreCase) >= 0)
                return JsonFormatDetector.Format.EsriJson;

            if (LooksLikeNdjson(head, NdjsonThreshold))
                return JsonFormatDetector.Format.GeoJsonSeq;

            if (head.IndexOf("\"FeatureCollection\"", StringComparison.OrdinalIgnoreCase) >= 0 ||
                head.IndexOf("\"Feature\"", StringComparison.OrdinalIgnoreCase) >= 0 ||
                head.IndexOf("\"coordinates\"", StringComparison.OrdinalIgnoreCase) >= 0)
                return JsonFormatDetector.Format.GeoJson;

            return JsonFormatDetector.Format.Unknown;
        }
    }
}