using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using GitConverter.Lib.Factories;
using GitConverter.Lib.Logging;
using GitConverter.Lib.Models;
using SharpCompress.Archives;

namespace GitConverter.Lib.Converters
{
    /// <summary>
    /// High-level conversion orchestrator.
    /// </summary>
    /// <remarks>
    /// <para>Responsibilities:</para>
    /// <list type="bullet">
    ///   <item>Validate paths (input/output exists, temp can be created).</item>
    ///   <item>Inspect input: single file or archive.</item>
    ///   <item>Detect the best matching converter based on input extension or archive contents
    ///   (required file extensions).</item>
    ///   <item>Resolve the converter using <see cref="IConverterFactory"/> (TryCreate) and invoke
    ///   its Convert method.</item>
    ///   <item>Log each step and return a friendly <c>ConversionResult</c> on expected failures
    ///   (validation, unknown option, missing required files) rather than throwing.</item>
    /// </list>
    /// <para>Archive &amp; JSON detection notes:</para>
    /// <list type="bullet">
    ///   <item>Archive detection is primarily extension/entry-driven (shp/shx/dbf -> Shapefile).</item>
    ///   <item>JSON inside archives is ambiguous (GeoJson / EsriJson / TopoJson / GeoJsonSeq). To reduce
    ///   false positives, when an archive contains .json entries we perform a lightweight header read
    ///   (&lt;= 64KB per JSON entry), classify each candidate entry, then vote across entries and pick
    ///   the majority format. A tie is treated as ambiguous and yields a friendly failure prompting the
    ///   caller to specify an explicit converter option.</item>
    ///   <item>KMZ is a zipped KML and is often recognized by a top-level "doc.kml". To avoid
    ///   false positives where a generic ZIP simply contains a nested .kml, prefer "Kmz" only when the
    ///   outer filename is .kmz OR when a top-level doc.kml exists.</item>
    ///   <item>NDJSON/GeoJsonSeq detection requires at least two JSON-looking lines at file head to
    ///   avoid misclassifying single-object GeoJSON as NDJSON.</item>
    /// </list>
    /// </remarks>
    public static class ConversionService
    {
        // For each converter key, ALL listed extension markers must be present among the
        // archive's discovered extensions for the rule to match (first matching rule wins).
        private static readonly Dictionary<string, string[]> _s_archiveRequirements =
            new Dictionary<string, string[]>(StringComparer.OrdinalIgnoreCase)
            {
                { "EsriJson", new[] { ".json", ".esrijson" } },
                { "GeoJson", new[] { ".geojson", ".json" } },
                { "GeoJsonSeq", new[] { ".json" } },
                { "Kml", new[] { ".kml" } },
                { "Kmz", new[] { ".kml" } },
                { "Shapefile", new[] { ".shp", ".shx", ".dbf" } },
                { "Osm", new[] { ".osm" } },
                { "Gdb", new[] { ".gdb" } },
                { "Gpx", new[] { ".gpx" } },
                { "TopoJson", new[] { ".json" } },
                { "MapInfoInterchange", new[] { ".mif" } },
                { "MapInfoTab", new[] { ".tab", ".dat", ".map", ".id" } },
                { "Csv", new[] { ".csv" } },
                { "GeoPackage", new[] { ".gpkg" } },
            };

        // Direct mapping from a single-file input extension to a converter key.
        // JSON-family extensions are handled separately via content sniffing in Run(...).
        private static readonly Dictionary<string, string> _s_extensionToConverter =
            new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
            {
                { ".geojson", "GeoJson" },
                { ".esrijson", "EsriJson" },
                { ".kml", "Kml" },
                { ".kmz", "Kmz" },
                { ".shp", "Shapefile" },
                { ".osm", "Osm" },
                { ".gpx", "Gpx" },
                { ".gml", "Gml" },
                { ".gdb", "Gdb" },
                { ".mif", "MapInfoInterchange" },
                { ".tab", "MapInfoTab" },
                { ".map", "MapInfoTab" },
                { ".dat", "MapInfoTab" },
                { ".id", "MapInfoTab" },
                { ".csv", "Csv" },
                { ".gpkg", "GeoPackage" },
            };

        /// <summary>
        /// Orchestrate a conversion given paths and a factory.
        /// </summary>
        /// <param name="gisInputFilePath">Path to the input GIS file or archive.</param>
        /// <param name="outputFolderPath">Output FOLDER path (not a file path).</param>
        /// <param name="tempFolderPath">Scratch folder used during conversion.</param>
        /// <param name="factory">Optional converter factory; defaults to <c>ConverterFactory</c>.</param>
        /// <returns>A <c>ConversionResult</c>; expected failures are returned, not thrown.</returns>
        public static ConversionResult Run(string gisInputFilePath, string outputFolderPath, string tempFolderPath, IConverterFactory factory = null)
        {
            try
            {
                Log.Info("ConversionService: Run invoked.");

                // Require an output FOLDER path (tests and callers expect folder semantics).
                if (string.IsNullOrWhiteSpace(outputFolderPath))
                {
                    Log.Error("ConversionService: output folder path is required.");
                    return ConversionResult.Failure("Output folder path is required.");
                }

                // Reject file-like paths: caller must provide a folder, not a file with extension.
                // NOTE(review): Path.HasExtension also rejects folder names containing a dot
                // (e.g. "out.v2") — accepted limitation of this guard.
                if (Path.HasExtension(outputFolderPath))
                {
                    Log.Error($"ConversionService: output path '{outputFolderPath}' appears to be a file. Provide a folder path instead.");
                    return ConversionResult.Failure("Output path must be a folder path (no filename/extension).");
                }

                var outFolderForValidation = outputFolderPath;

                // Validate inputs and prepare folders (ensure output folder writable and temp ready).
                var prep = ConverterUtils.ValidateAndPreparePaths(gisInputFilePath, outFolderForValidation, tempFolderPath);
                if (prep != null)
                    return prep; // validation failure

                if (factory == null)
                {
                    factory = new ConverterFactory();
                }

                // Determine input kind: archive vs. single file.
                if (ConverterUtils.IsArchiveFile(gisInputFilePath))
                {
                    Log.Info($"Input '{gisInputFilePath}' detected as archive. Inspecting contents.");
                    var entries = ConverterUtils.TryListArchiveEntries(gisInputFilePath);
                    if (entries == null)
                    {
                        Log.Error("Failed to list archive entries.");
                        return ConversionResult.Failure("Failed to inspect archive contents.");
                    }

                    var matchedConverter = DetectConverterFromArchiveEntries(entries, gisInputFilePath, out string detectReason);
                    if (string.IsNullOrEmpty(matchedConverter))
                    {
                        Log.Warn("No converter matched archive contents (or match ambiguous).");
                        if (!string.IsNullOrEmpty(detectReason))
                            Log.Info($"Archive detection reason: {detectReason}");
                        return ConversionResult.Failure("No converter matched archive contents or required files are missing or ambiguous.");
                    }

                    Log.Info($"Archive matched converter '{matchedConverter}'. Reason: {detectReason}");
                    if (!factory.TryCreate(matchedConverter, out var conv))
                    {
                        Log.Error($"ConverterFactory failed to resolve converter '{matchedConverter}'.");
                        return ConversionResult.Failure($"Converter for '{matchedConverter}' is not available.");
                    }

                    Log.Info($"Converter '{matchedConverter}' resolved. Invoking Convert(...).");
                    // Converters expect an output folder path; pass the folder.
                    return conv.Convert(gisInputFilePath, matchedConverter, outputFolderPath, tempFolderPath);
                }
                else
                {
                    var ext = Path.GetExtension(gisInputFilePath);
                    Log.Info($"Input '{gisInputFilePath}' detected as single file with extension '{ext}'.");

                    // Any "*json" extension (.json/.geojson/.esrijson/...) goes through content sniffing,
                    // because the extension alone cannot distinguish the JSON GIS dialects.
                    if (!string.IsNullOrWhiteSpace(ext) && ext.EndsWith("json", StringComparison.OrdinalIgnoreCase))
                    {
                        // JSON detection: prefer JsonFormatDetector.DetectFromFile.
                        JsonFormatDetector.Format jsonFmt = JsonFormatDetector.Format.Unknown;
                        string reason = null;
                        try
                        {
                            jsonFmt = JsonFormatDetector.DetectFromFile(gisInputFilePath);
                            if (jsonFmt != JsonFormatDetector.Format.Unknown)
                                reason = "JsonFormatDetector.DetectFromFile";
                        }
                        catch (Exception detEx)
                        {
                            Log.Debug($"JsonFormatDetector.DetectFromFile threw: {detEx.Message}. Will attempt lightweight header sniff.");
                            jsonFmt = JsonFormatDetector.Format.Unknown;
                        }

                        if (jsonFmt == JsonFormatDetector.Format.Unknown)
                        {
                            // Read a bounded head of the file (e.g., 64KB) to avoid loading huge files.
                            var head = ReadHeadUtf8(gisInputFilePath, maxBytes: 64 * 1024);
                            jsonFmt = ClassifyJsonHeader(head);

                            // Derive a reason string for traceability.
                            if (jsonFmt == JsonFormatDetector.Format.GeoJsonSeq)
                                reason = "Header sniff: NDJSON heuristic (>=2 JSON lines)";
                            else if (jsonFmt == JsonFormatDetector.Format.TopoJson)
                                reason = "Header sniff: TopoJSON fingerprint";
                            else if (jsonFmt == JsonFormatDetector.Format.EsriJson)
                                reason = "Header sniff: EsriJSON fingerprint";
                            else if (jsonFmt == JsonFormatDetector.Format.GeoJson)
                                reason = "Header sniff: GeoJSON fingerprint (Feature/coordinates/FeatureCollection)";
                            else
                                reason = "Header sniff: unknown";
                        }

                        if (jsonFmt == JsonFormatDetector.Format.Unknown)
                        {
                            Log.Error("Unable to parse JSON input to determine specific JSON GIS format.");
                            return ConversionResult.Failure("Unable to determine JSON format (GeoJson / EsriJson / GeoJsonSeq / TopoJson).");
                        }

                        string converterKeyForJson = null;
                        switch (jsonFmt)
                        {
                            case JsonFormatDetector.Format.GeoJson:
                                converterKeyForJson = "GeoJson";
                                break;
                            case JsonFormatDetector.Format.EsriJson:
                                converterKeyForJson = "EsriJson";
                                break;
                            case JsonFormatDetector.Format.GeoJsonSeq:
                                converterKeyForJson = "GeoJsonSeq";
                                break;
                            case JsonFormatDetector.Format.TopoJson:
                                converterKeyForJson = "TopoJson";
                                break;
                            default:
                                converterKeyForJson = null;
                                break;
                        }

                        if (string.IsNullOrWhiteSpace(converterKeyForJson))
                        {
                            Log.Error("Failed to map detected JSON format to a converter key.");
                            return ConversionResult.Failure("Failed to map JSON format to converter.");
                        }

                        Log.Info($"Detected JSON format '{jsonFmt}' (reason: {reason}). Resolving converter '{converterKeyForJson}'.");
                        if (!factory.TryCreate(converterKeyForJson, out var convJson))
                        {
                            Log.Error($"ConverterFactory failed to resolve converter '{converterKeyForJson}'.");
                            return ConversionResult.Failure($"Converter for '{converterKeyForJson}' is not available.");
                        }

                        return convJson.Convert(gisInputFilePath, converterKeyForJson, outputFolderPath, tempFolderPath);
                    }

                    if (!_s_extensionToConverter.TryGetValue(ext, out var converterKeyNonJson))
                    {
                        Log.Warn($"No converter mapping for extension '{ext}'.");
                        return ConversionResult.Failure($"Unknown input file type '{ext}'.");
                    }

                    Log.Info($"Mapped extension '{ext}' to converter '{converterKeyNonJson}' (reason: extension mapping). Attempting to resolve.");
                    if (!factory.TryCreate(converterKeyNonJson, out var convNonJson))
                    {
                        Log.Error($"ConverterFactory failed to resolve converter '{converterKeyNonJson}'.");
                        return ConversionResult.Failure($"Converter for '{converterKeyNonJson}' is not available.");
                    }

                    Log.Info($"Converter '{converterKeyNonJson}' resolved. Invoking Convert(...).");
                    return convNonJson.Convert(gisInputFilePath, converterKeyNonJson, outputFolderPath, tempFolderPath);
                }
            }
            catch (Exception ex)
            {
                Log.Error($"Unexpected error in ConversionService.Run: {ex.Message}", ex);
                return ConversionResult.Failure($"Unexpected error: {ex.Message}");
            }
        }

        /// <summary>
        /// Detect a converter key from the archive entries.
        /// </summary>
        /// <param name="entries">Archive entry names (file paths inside archive).</param>
        /// <param name="outerPath">The archive path on disk (used for outer extension guard, e.g. .kmz).</param>
        /// <param name="reason">Outputs a diagnostic reason for the selection (for logging).</param>
        /// <returns>
        /// A converter key, or null when no rule matches or when JSON-based detection is ambiguous.
        /// </returns>
        /// <remarks>
        /// Detection strategy:
        /// - Collect a set of discovered extensions and folder markers (e.g. .gdb from path segments).
        /// - If .json entries are present, perform lightweight header reads and a voting mechanism across
        ///   JSON entries. The majority vote selects the converter. Ties are treated as ambiguous (null).
        /// - Heuristic: detect top-level "doc.kml" which is the KMZ convention; prefer Kmz only when the
        ///   outer archive has a .kmz extension OR when doc.kml exists at top-level (to avoid false
        ///   positives for generic ZIPs).
        /// - Match the discovered extensions against _s_archiveRequirements; the first requirement entry
        ///   where all required markers are present is selected.
        /// </remarks>
        private static string DetectConverterFromArchiveEntries(IEnumerable<string> entries, string outerPath, out string reason)
        {
            reason = null;
            var exts = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
            bool hasTopLevelDocKml = false;

            foreach (var e in entries ?? Enumerable.Empty<string>())
            {
                try
                {
                    if (string.IsNullOrWhiteSpace(e))
                        continue;

                    // Normal extension from the entry path (file or dir entry).
                    var ext = Path.GetExtension(e);
                    if (!string.IsNullOrEmpty(ext))
                        exts.Add(ext.ToLowerInvariant());

                    // Normalize to forward slashes for segment operations.
                    var normalized = e.Replace('\\', '/').Trim('/');

                    // Top-level doc.kml heuristic (KMZ convention).
                    if (string.Equals(normalized, "doc.kml", StringComparison.OrdinalIgnoreCase))
                        hasTopLevelDocKml = true;

                    // Inspect path segments for folder markers ending with known suffixes (e.g., .gdb).
                    var segments = normalized.Split(new[] { '/' }, StringSplitOptions.RemoveEmptyEntries);
                    foreach (var seg in segments)
                    {
                        var idx = seg.LastIndexOf('.');
                        if (idx > 0 && idx < seg.Length - 1)
                        {
                            var segExt = seg.Substring(idx).ToLowerInvariant();
                            exts.Add(segExt);
                        }

                        // Explicit .gdb folder marker (additional safety).
                        if (seg.EndsWith(".gdb", StringComparison.OrdinalIgnoreCase))
                            exts.Add(".gdb");
                    }
                }
                catch
                {
                    // Ignore malformed names; one bad entry should not abort detection.
                }
            }

            Log.Debug($"Archive contains {exts.Count} distinct extensions / markers: {string.Join(", ", exts)}");

            // If there are JSON entries, perform JSON entry voting to disambiguate among JSON formats.
            if (exts.Contains(".json"))
            {
                try
                {
                    var votes = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
                    using (var arc = ArchiveFactory.Open(outerPath))
                    {
                        foreach (var entry in arc.Entries.Where(e => !e.IsDirectory))
                        {
                            // Use filename to check extension (robust to path segments inside archive).
                            var entryName = Path.GetFileName(entry.Key ?? string.Empty);
                            if (string.IsNullOrEmpty(entryName))
                                continue;
                            if (!entryName.EndsWith(".json", StringComparison.OrdinalIgnoreCase))
                                continue;

                            try
                            {
                                var head = ReadEntryHeadUtf8(entry, maxBytes: 64 * 1024);
                                var fmt = ClassifyJsonHeader(head);

                                // Map the detected format to its vote key; Unknown gets no vote.
                                string voteKey = null;
                                switch (fmt)
                                {
                                    case JsonFormatDetector.Format.TopoJson:
                                        voteKey = "TopoJson";
                                        break;
                                    case JsonFormatDetector.Format.EsriJson:
                                        voteKey = "EsriJson";
                                        break;
                                    case JsonFormatDetector.Format.GeoJsonSeq:
                                        voteKey = "GeoJsonSeq";
                                        break;
                                    case JsonFormatDetector.Format.GeoJson:
                                        voteKey = "GeoJson";
                                        break;
                                }

                                if (voteKey != null)
                                {
                                    votes.TryGetValue(voteKey, out var current);
                                    votes[voteKey] = current + 1;
                                }
                            }
                            catch (Exception exEntry)
                            {
                                Log.Debug($"JSON entry sniffing failed for '{entry.Key}': {exEntry.Message}");
                            }
                        }
                    }

                    if (votes.Count > 0)
                    {
                        Log.Debug($"JSON votes: {string.Join(", ", votes.Select(kv => kv.Key + "=" + kv.Value))}");
                        var max = votes.Values.Max();
                        var winners = votes.Where(kv => kv.Value == max).Select(kv => kv.Key).ToArray();
                        if (winners.Length == 1)
                        {
                            reason = $"JSON voting majority ({winners[0]}={max}) over entries: {string.Join(", ", votes.Select(kv => kv.Key + "=" + kv.Value))}";
                            Log.Debug($"JSON majority selected '{winners[0]}'. Reason: {reason}");
                            return winners[0];
                        }
                        else
                        {
                            reason = $"JSON voting tie across entries: {string.Join(", ", votes.Select(kv => kv.Key + "=" + kv.Value))}";
                            Log.Warn("Ambiguous JSON types inside archive (tie in votes); failing with friendly message.");
                            return null; // ambiguous
                        }
                    }
                    // else fall through to strict extension-based matching
                }
                catch (Exception ex)
                {
                    Log.Debug($"Failed to perform JSON-entry voting for archive '{outerPath}': {ex.Message}");
                    // Fall through to the other detection heuristics.
                }
            }

            // KMZ guard: prefer real KMZ only when outer is .kmz or top-level doc.kml exists.
            try
            {
                if (!string.IsNullOrWhiteSpace(outerPath))
                {
                    var outerExt = Path.GetExtension(outerPath) ?? string.Empty;
                    if (string.Equals(outerExt, ".kmz", StringComparison.OrdinalIgnoreCase))
                    {
                        reason = "KMZ guard: outer .kmz extension";
                        Log.Debug(reason);
                        return "Kmz";
                    }

                    if (hasTopLevelDocKml)
                    {
                        reason = "KMZ guard: top-level doc.kml present";
                        Log.Debug(reason);
                        return "Kmz";
                    }
                }
            }
            catch
            {
                // Ignore any path parsing issues and continue to strict matching.
            }

            // Strict requirement match (first matching rule wins).
            foreach (var kv in _s_archiveRequirements)
            {
                var required = kv.Value;
                var allPresent = required.All(r => exts.Contains(r));
                if (allPresent)
                {
                    reason = $"Requirement match: {kv.Key}";
                    Log.Debug(reason);
                    return kv.Key;
                }
            }

            Log.Debug("No archive-based converter match found.");
            return null;
        }

        /// <summary>
        /// Read up to <paramref name="maxBytes"/> bytes from the start of the file and decode as UTF8.
        /// This avoids loading very large inputs entirely when we only need a header snippet
        /// for format detection.
        /// </summary>
        /// <param name="path">File to read.</param>
        /// <param name="maxBytes">Maximum number of bytes to read from the head of the file.</param>
        /// <returns>The decoded head snippet, or an empty string when the file cannot be read.</returns>
        private static string ReadHeadUtf8(string path, int maxBytes = 64 * 1024)
        {
            try
            {
                using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
                {
                    var toRead = (int)Math.Min(maxBytes, fs.Length);
                    var buffer = new byte[toRead];
                    var read = fs.Read(buffer, 0, toRead);
                    return Encoding.UTF8.GetString(buffer, 0, read);
                }
            }
            catch (Exception ex)
            {
                // Best-effort: detection falls back to "Unknown" when the head is unreadable.
                Log.Debug($"ReadHeadUtf8: failed to read head of '{path}': {ex.Message}");
                return string.Empty;
            }
        }

        /// <summary>
        /// Read up to <paramref name="maxBytes"/> bytes from the start of an archive entry stream
        /// and decode as UTF8.
        /// </summary>
        /// <param name="entry">The archive entry to sample.</param>
        /// <param name="maxBytes">Maximum number of bytes to read from the entry.</param>
        /// <returns>The decoded head snippet, or an empty string when the entry cannot be read.</returns>
        private static string ReadEntryHeadUtf8(SharpCompress.Archives.IArchiveEntry entry, int maxBytes = 64 * 1024)
        {
            try
            {
                using (var s = entry.OpenEntryStream())
                {
                    var ms = new MemoryStream();
                    var buffer = new byte[8192];
                    int remaining = maxBytes;
                    int read;
                    while (remaining > 0 && (read = s.Read(buffer, 0, Math.Min(buffer.Length, remaining))) > 0)
                    {
                        ms.Write(buffer, 0, read);
                        remaining -= read;
                    }

                    return Encoding.UTF8.GetString(ms.ToArray());
                }
            }
            catch (Exception ex)
            {
                Log.Debug($"ReadEntryHeadUtf8: failed to read entry '{entry?.Key}': {ex.Message}");
                return string.Empty;
            }
        }

        /// <summary>
        /// Heuristic: count JSON-looking non-empty lines at file head.
        /// Returns true when at least <paramref name="threshold"/> lines start with '{' or '['.
        /// Stops early if a non-JSON-looking non-empty line is encountered.
        /// </summary>
        /// <remarks>
        /// Rationale:
        /// - NDJSON (GeoJSONSeq) should contain multiple JSON objects separated by newlines.
        /// - Single-file GeoJSON that happens to start with '{' must not be misclassified as NDJSON.
        /// - Using a threshold (default 2) reduces false positives: we require at least two
        ///   JSON-like lines.
        /// </remarks>
        private static bool LooksLikeNdjson(string text, int threshold = 2)
        {
            if (string.IsNullOrWhiteSpace(text))
                return false;

            int count = 0;
            using (var sr = new StringReader(text))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    line = line.Trim();
                    if (line.Length == 0)
                        continue;

                    if (line.StartsWith("{") || line.StartsWith("["))
                    {
                        if (++count >= threshold)
                            return true;
                    }
                    else
                    {
                        // A non-JSON token breaks the NDJSON expectation early.
                        break;
                    }
                }
            }

            return false;
        }

        /// <summary>
        /// Classify a JSON header/snippet into a JsonFormatDetector.Format.
        /// Uses fingerprints for TopoJSON and EsriJSON, requires >=2 JSON-like lines for NDJSON,
        /// and treats single Feature/Geometry objects as GeoJSON when "Feature" or "coordinates"
        /// are present.
        /// </summary>
        /// <param name="head">Bounded head snippet of the JSON content (may be truncated).</param>
        /// <returns>The detected format, or Unknown when no fingerprint matches.</returns>
        private static JsonFormatDetector.Format ClassifyJsonHeader(string head)
        {
            if (string.IsNullOrWhiteSpace(head))
                return JsonFormatDetector.Format.Unknown;

            // TopoJSON fingerprint ("type": "Topology"; the case-insensitive quoted-token search
            // matches the quoted value "Topology").
            if (head.IndexOf("\"type\"", StringComparison.OrdinalIgnoreCase) >= 0 &&
                head.IndexOf("\"topology\"", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return JsonFormatDetector.Format.TopoJson;
            }

            // EsriJSON heuristics: spatialReference / geometryType / attributes typical keys.
            if (head.IndexOf("\"spatialReference\"", StringComparison.OrdinalIgnoreCase) >= 0 ||
                head.IndexOf("\"geometryType\"", StringComparison.OrdinalIgnoreCase) >= 0 ||
                head.IndexOf("\"attributes\"", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return JsonFormatDetector.Format.EsriJson;
            }

            // NDJSON / GeoJsonSeq: require at least 2 JSON-looking lines at head.
            if (LooksLikeNdjson(head, threshold: 2))
                return JsonFormatDetector.Format.GeoJsonSeq;

            // GeoJSON: FeatureCollection/Feature or geometry w/ coordinates or single-feature
            // indications. Note the "Feature" search also matches "FeatureCollection", which is
            // harmless since both map to GeoJSON.
            if (head.IndexOf("\"FeatureCollection\"", StringComparison.OrdinalIgnoreCase) >= 0 ||
                head.IndexOf("\"Feature\"", StringComparison.OrdinalIgnoreCase) >= 0 ||
                head.IndexOf("\"coordinates\"", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return JsonFormatDetector.Format.GeoJson;
            }

            return JsonFormatDetector.Format.Unknown;
        }
    }
}