using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using GitConverter.Lib.Converters;
using GitConverter.Lib.Logging;
using SharpCompress.Archives;

namespace GitConverter.Lib.Factories
{
    /// <summary>
    /// Helpers to pick a converter from an input path (file or archive).
    /// Consumers call <c>TryCreateForInput</c> to get an <see cref="IConverter"/> resolved from an
    /// <see cref="IConverterFactory"/> by inspecting the input path (without extracting archives).
    /// </summary>
    /// <remarks>
    /// <para>Detection and selection summary</para>
    /// <list type="bullet">
    /// <item><description>
    /// Single-file inputs: <c>.geojson</c> and <c>.esrijson</c> are routed directly by extension. Any other
    /// extension ending in "json" is classified via <c>JsonFormatDetector.DetectFromFile</c>, falling back to a
    /// bounded header read (<c>HeaderReadLimit</c> bytes) classified by <c>ClassifyJsonHeader</c>. Every other
    /// extension goes through the extension-to-converter map.
    /// </description></item>
    /// <item><description>
    /// Archive inputs: entry names are listed via <c>ConverterUtils.TryListArchiveEntries</c> and reduced to a
    /// set of extension markers. Checks run in this order: explicit <c>.geojson</c> / <c>.esrijson</c> entries,
    /// the KMZ guard (outer <c>.kmz</c> extension or a top-level <c>doc.kml</c> entry), header-based voting over
    /// <c>.json</c> entries (run whenever at least one <c>.json</c> entry exists), then strict requirement
    /// matching where a rule wins when all of its required markers are present.
    /// </description></item>
    /// <item><description>
    /// JSON voting: each <c>.json</c> entry contributes one vote for GeoJson / EsriJson / GeoJsonSeq / TopoJson
    /// based on its header. A single top-scoring format wins; a tie is reported as ambiguous and fails.
    /// </description></item>
    /// </list>
    /// <para>Safety and error handling</para>
    /// <list type="bullet">
    /// <item><description>Header reads (files and archive entries) are bounded by <c>HeaderReadLimit</c>.</description></item>
    /// <item><description>
    /// Apart from a null factory (which throws <see cref="ArgumentNullException"/>), failures are reported by
    /// returning false with a human-readable detect reason; unexpected exceptions are caught and logged.
    /// </description></item>
    /// </list>
    /// </remarks>
    public static class ConverterFactoryInputExtensions
    {
        /// <summary>Minimum number of leading JSON-like lines required to classify a header as NDJSON.</summary>
        private const int NdjsonThreshold = 2;

        /// <summary>Maximum number of bytes read from a file or archive entry for header sniffing (64 KB).</summary>
        private const int HeaderReadLimit = 64 * 1024;

        /// <summary>Character produced when a UTF-8 byte order mark is decoded as text.</summary>
        private const char ByteOrderMark = '\uFEFF';

        // Minimal extension -> converter key map used for single-file inputs.
        private static readonly Dictionary<string, string> _s_extensionToConverter =
            new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
            {
                { ".geojson", "GeoJson" },
                { ".esrijson", "EsriJson" },
                { ".kml", "Kml" },
                { ".kmz", "Kmz" },
                { ".shp", "Shapefile" },
                { ".osm", "Osm" },
                { ".gpx", "Gpx" },
                { ".gml", "Gml" },
                { ".gdb", "Gdb" },
                { ".mif", "MapInfoInterchange" },
                { ".tab", "MapInfoTab" },
                { ".map", "MapInfoTab" },
                { ".dat", "MapInfoTab" },
                { ".id", "MapInfoTab" },
                { ".csv", "Csv" },
                { ".gpkg", "GeoPackage" },
            };

        // Archive requirements: converter key -> markers that must ALL be present among the archive's entries.
        // Entries that only rely on the generic ".json" suffix are intentionally NOT listed here; they are
        // disambiguated by header sniffing / voting instead. TopoJson is detected by header fingerprints.
        // NOTE(review): rule precedence follows the dictionary's enumeration order. That matches insertion order
        // in practice but is not a documented guarantee of Dictionary<TKey, TValue> - confirm this is acceptable.
        private static readonly Dictionary<string, string[]> _s_archiveRequirements =
            new Dictionary<string, string[]>(StringComparer.OrdinalIgnoreCase)
            {
                { "EsriJson", new[] { ".esrijson" } },
                { "Gml", new[] { ".gml" } },
                { "GeoJson", new[] { ".geojson" } },
                { "Kml", new[] { ".kml" } },
                { "Kmz", new[] { ".kml" } },
                { "Shapefile", new[] { ".shp", ".shx", ".dbf" } },
                { "Osm", new[] { ".osm" } },
                { "Gdb", new[] { ".gdb" } },
                { "Gpx", new[] { ".gpx" } },
                { "MapInfoInterchange", new[] { ".mif" } },
                { "MapInfoTab", new[] { ".tab", ".dat", ".map", ".id" } },
                { "Csv", new[] { ".csv" } },
                { "GeoPackage", new[] { ".gpkg" } },
            };

        /// <summary>
        /// Inspect the input path (file or archive) and attempt to resolve a converter from the factory.
        /// Lightweight overload that returns only the converter and discards the detect reason.
        /// </summary>
        /// <param name="factory">Factory used to resolve a converter key to an instance.</param>
        /// <param name="gisInputFilePath">Path to a single GIS file or archive to inspect.</param>
        /// <param name="converter">When true is returned, contains the resolved converter instance.</param>
        /// <returns>True when a converter was successfully resolved; false otherwise.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="factory"/> is null.</exception>
        public static bool TryCreateForInput(this IConverterFactory factory, string gisInputFilePath, out IConverter converter)
        {
            return TryCreateForInput(factory, gisInputFilePath, out converter, out _);
        }

        /// <summary>
        /// Inspect the input path (file or archive) and attempt to resolve a converter from the factory.
        /// </summary>
        /// <param name="factory">Factory used to resolve a converter key to an instance.</param>
        /// <param name="gisInputFilePath">Path to a single GIS file or archive to inspect.</param>
        /// <param name="converter">Populated with the resolved converter when the method returns true; null otherwise.</param>
        /// <param name="detectReason">
        /// Human-friendly description of how the converter was selected, or of why detection failed.
        /// </param>
        /// <returns>
        /// The result of <c>factory.TryCreate</c> for the detected converter key, or false when the path is blank,
        /// detection failed, the result was ambiguous, or an unexpected exception occurred.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="factory"/> is null.</exception>
        /// <remarks>
        /// Archive inputs (as decided by <c>ConverterUtils.IsArchiveFile</c>) are inspected by entry name first and
        /// only <c>.json</c> entries are opened, for bounded header reads. Any exception raised during detection is
        /// caught, logged, and reported through <paramref name="detectReason"/>.
        /// </remarks>
        public static bool TryCreateForInput(this IConverterFactory factory, string gisInputFilePath, out IConverter converter, out string detectReason)
        {
            if (factory == null)
            {
                throw new ArgumentNullException(nameof(factory));
            }

            converter = null;
            detectReason = null;

            if (string.IsNullOrWhiteSpace(gisInputFilePath))
            {
                // Surface the failure to the caller as well as the log so the documented
                // "returns false and sets detectReason" contract also holds for blank paths.
                detectReason = "TryCreateForInput: input path required.";
                Log.Error(detectReason);
                return false;
            }

            try
            {
                return ConverterUtils.IsArchiveFile(gisInputFilePath)
                    ? TryCreateForArchive(factory, gisInputFilePath, out converter, out detectReason)
                    : TryCreateForSingleFile(factory, gisInputFilePath, out converter, out detectReason);
            }
            catch (Exception ex)
            {
                detectReason = "Unexpected error in TryCreateForInput: " + ex.Message;
                Log.Error(detectReason, ex);
                converter = null;
                return false;
            }
        }

        /// <summary>
        /// Resolve a converter for an archive input using entry names and, for .json entries, bounded header reads.
        /// </summary>
        private static bool TryCreateForArchive(IConverterFactory factory, string archivePath, out IConverter converter, out string detectReason)
        {
            converter = null;
            detectReason = null;

            var entries = ConverterUtils.TryListArchiveEntries(archivePath);
            if (entries == null)
            {
                detectReason = "Failed to list archive entries.";
                Log.Debug(detectReason);
                return false;
            }

            var exts = CollectArchiveMarkers(entries, out bool hasTopLevelDocKml);

            string outerExt = string.Empty;
            try
            {
                outerExt = Path.GetExtension(archivePath) ?? string.Empty;
            }
            catch (ArgumentException)
            {
                // Invalid path characters: treat the archive as having no outer extension.
            }

            bool kmzGuardPassed = string.Equals(outerExt, ".kmz", StringComparison.OrdinalIgnoreCase) || hasTopLevelDocKml;

            // Fast wins from explicit archive entry extensions (without opening entries).
            if (exts.Contains(".geojson"))
            {
                detectReason = "Archive filename contains .geojson entries";
                return factory.TryCreate("GeoJson", out converter);
            }

            if (exts.Contains(".esrijson"))
            {
                detectReason = "Archive filename contains .esrijson entries";
                return factory.TryCreate("EsriJson", out converter);
            }

            if (kmzGuardPassed)
            {
                detectReason = "KMZ guard detected (outer .kmz or top-level doc.kml)";
                return factory.TryCreate("Kmz", out converter);
            }

            // Generic .json entries carry no format hint in their name: sniff headers and vote.
            // (.geojson / .esrijson cannot be present here; both returned above.)
            if (exts.Contains(".json"))
            {
                var votes = CollectJsonVotes(archivePath);
                if (votes.Count > 0)
                {
                    var tally = string.Join(", ", votes.Select(kv => kv.Key + "=" + kv.Value));
                    Log.Debug("JSON votes: " + tally);

                    var max = votes.Values.Max();
                    var winners = votes.Where(kv => kv.Value == max).Select(kv => kv.Key).ToArray();
                    if (winners.Length == 1)
                    {
                        detectReason = "JSON voting majority (" + winners[0] + "=" + max + ") over entries: " + tally;
                        Log.Debug(detectReason);
                        return factory.TryCreate(winners[0], out converter);
                    }

                    // Friendly failure: a tie cannot be resolved without caller input.
                    detectReason = "ambiguous JSON in archive—please specify format";
                    Log.Warn("Ambiguous JSON types inside archive (tie in votes): " + tally);
                    return false;
                }

                // No usable votes (or voting failed): fall through to requirement matching.
            }

            // Strict requirement match: the first rule whose markers are all present wins.
            foreach (var rule in _s_archiveRequirements)
            {
                if (string.Equals(rule.Key, "Kmz", StringComparison.OrdinalIgnoreCase) && !kmzGuardPassed)
                {
                    continue;
                }

                if (rule.Value.All(required => exts.Contains(required)))
                {
                    detectReason = "Requirement match: " + rule.Key;
                    return factory.TryCreate(rule.Key, out converter);
                }
            }

            detectReason = "No archive-based converter match found (based on filename inspection).";
            Log.Debug(detectReason);
            return false;
        }

        /// <summary>
        /// Build the set of lower-cased extension markers found in archive entry paths (including extensions of
        /// intermediate path segments such as "data.gdb/") and report whether a top-level doc.kml entry exists.
        /// </summary>
        private static HashSet<string> CollectArchiveMarkers(IEnumerable<string> entries, out bool hasTopLevelDocKml)
        {
            var exts = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
            hasTopLevelDocKml = false;

            foreach (var entryPath in entries)
            {
                if (string.IsNullOrWhiteSpace(entryPath))
                {
                    continue;
                }

                var entryExt = Path.GetExtension(entryPath);
                if (!string.IsNullOrEmpty(entryExt))
                {
                    exts.Add(entryExt.ToLowerInvariant());
                }

                var normalized = entryPath.Replace('\\', '/').Trim('/');
                if (string.Equals(normalized, "doc.kml", StringComparison.OrdinalIgnoreCase))
                {
                    hasTopLevelDocKml = true;
                }

                // Directory-like segments (for example "roads.gdb/") also act as format markers.
                foreach (var segment in normalized.Split(new[] { '/' }, StringSplitOptions.RemoveEmptyEntries))
                {
                    var dot = segment.LastIndexOf('.');
                    if (dot > 0 && dot < segment.Length - 1)
                    {
                        exts.Add(segment.Substring(dot).ToLowerInvariant());
                    }

                    // Covers a segment that is exactly ".gdb", which the dot > 0 rule above skips.
                    if (segment.EndsWith(".gdb", StringComparison.OrdinalIgnoreCase))
                    {
                        exts.Add(".gdb");
                    }
                }
            }

            return exts;
        }

        /// <summary>
        /// Open the archive, classify the bounded header of every .json entry, and count one vote per entry for
        /// the converter key of the detected format. Returns an empty tally when the archive cannot be processed.
        /// </summary>
        private static Dictionary<string, int> CollectJsonVotes(string archivePath)
        {
            var votes = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);

            try
            {
                using (var archive = ArchiveFactory.Open(archivePath))
                {
                    foreach (var entry in archive.Entries.Where(en => !en.IsDirectory))
                    {
                        var entryName = Path.GetFileName(entry.Key ?? string.Empty);
                        if (string.IsNullOrEmpty(entryName)
                            || !entryName.EndsWith(".json", StringComparison.OrdinalIgnoreCase))
                        {
                            continue;
                        }

                        try
                        {
                            var head = ReadEntryHeadUtf8(entry, HeaderReadLimit);
                            var key = MapJsonFormatToConverterKey(ClassifyJsonHeader(head));
                            if (key == null)
                            {
                                continue; // unclassifiable entries do not vote
                            }

                            votes.TryGetValue(key, out var count);
                            votes[key] = count + 1;
                        }
                        catch (Exception exEntry)
                        {
                            Log.Debug("JSON entry sniffing failed for '" + entry.Key + "': " + exEntry.Message);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Log.Debug("Failed to perform JSON-entry voting for archive '" + archivePath + "': " + ex.Message);

                // A partially counted tally is not trustworthy; let the caller fall back to requirement matching.
                votes.Clear();
            }

            return votes;
        }

        /// <summary>
        /// Resolve a converter for a single (non-archive) file from its extension and, for JSON files, its content.
        /// </summary>
        private static bool TryCreateForSingleFile(IConverterFactory factory, string filePath, out IConverter converter, out string detectReason)
        {
            converter = null;

            var ext = (Path.GetExtension(filePath) ?? string.Empty).ToLowerInvariant();

            // Direct extension routing for explicit JSON-type extensions (no NDJSON sniff).
            // TopoJSON commonly uses the plain .json extension, so it is left to header sniffing below.
            if ((ext == ".geojson" || ext == ".esrijson") && _s_extensionToConverter.TryGetValue(ext, out var mapped))
            {
                detectReason = $"Mapped extension '{ext}' to converter '{mapped}' (explicit mapping).";
                return factory.TryCreate(mapped, out converter);
            }

            // Any other extension ending in "json": content-based detection with the NDJSON rule.
            if (!string.IsNullOrWhiteSpace(ext) && ext.EndsWith("json", StringComparison.OrdinalIgnoreCase))
            {
                var jsonFmt = JsonFormatDetector.Format.Unknown;
                string reason = null;

                try
                {
                    jsonFmt = JsonFormatDetector.DetectFromFile(filePath);
                    if (jsonFmt != JsonFormatDetector.Format.Unknown)
                    {
                        reason = "JsonFormatDetector.DetectFromFile";
                    }
                }
                catch (Exception detEx)
                {
                    Log.Debug("JsonFormatDetector.DetectFromFile threw: " + detEx.Message + ". Will attempt lightweight header sniff.");
                    jsonFmt = JsonFormatDetector.Format.Unknown;
                }

                if (jsonFmt == JsonFormatDetector.Format.Unknown)
                {
                    jsonFmt = ClassifyJsonHeader(ReadHeadUtf8(filePath, HeaderReadLimit));
                    reason = DescribeHeaderSniff(jsonFmt);
                }

                if (jsonFmt == JsonFormatDetector.Format.Unknown)
                {
                    detectReason = "Unable to determine JSON format (GeoJson / EsriJson / GeoJsonSeq / TopoJson).";
                    Log.Error(detectReason);
                    return false;
                }

                var converterKeyForJson = MapJsonFormatToConverterKey(jsonFmt);
                if (string.IsNullOrWhiteSpace(converterKeyForJson))
                {
                    detectReason = "Failed to map detected JSON format to a converter key.";
                    Log.Error(detectReason);
                    return false;
                }

                detectReason = $"Detected JSON format '{jsonFmt}' (reason: {reason}).";
                return factory.TryCreate(converterKeyForJson, out converter);
            }

            // Non-JSON extension mapping.
            if (!_s_extensionToConverter.TryGetValue(ext, out var converterKeyNonJson))
            {
                detectReason = $"Unknown input file extension '{ext}'";
                Log.Warn(detectReason);
                return false;
            }

            detectReason = $"Mapped extension '{ext}' to converter '{converterKeyNonJson}' (extension mapping).";
            return factory.TryCreate(converterKeyNonJson, out converter);
        }

        /// <summary>
        /// Map a detected JSON format to its converter key, or null when the format has no converter.
        /// </summary>
        private static string MapJsonFormatToConverterKey(JsonFormatDetector.Format format)
        {
            switch (format)
            {
                case JsonFormatDetector.Format.GeoJson:
                    return "GeoJson";
                case JsonFormatDetector.Format.EsriJson:
                    return "EsriJson";
                case JsonFormatDetector.Format.GeoJsonSeq:
                    return "GeoJsonSeq";
                case JsonFormatDetector.Format.TopoJson:
                    return "TopoJson";
                default:
                    return null;
            }
        }

        /// <summary>
        /// Produce the diagnostic reason text for a format obtained through header sniffing.
        /// </summary>
        private static string DescribeHeaderSniff(JsonFormatDetector.Format format)
        {
            switch (format)
            {
                case JsonFormatDetector.Format.GeoJsonSeq:
                    return $"Header sniff: NDJSON heuristic (>= {NdjsonThreshold} JSON lines)";
                case JsonFormatDetector.Format.TopoJson:
                    return "Header sniff: TopoJSON fingerprint";
                case JsonFormatDetector.Format.EsriJson:
                    return "Header sniff: EsriJSON fingerprint";
                case JsonFormatDetector.Format.GeoJson:
                    return "Header sniff: GeoJSON fingerprint (Feature/coordinates/FeatureCollection)";
                default:
                    return "Header sniff: unknown";
            }
        }

        /// <summary>
        /// Heuristic to detect newline-delimited JSON (NDJSON / GeoJSON sequence) from the head text.
        /// </summary>
        /// <param name="text">Decoded head of the file or entry.</param>
        /// <param name="threshold">Minimum number of JSON-like lines required to consider the text NDJSON.</param>
        /// <returns>True when the text looks like NDJSON; otherwise false.</returns>
        /// <remarks>
        /// Counts non-empty lines whose first non-whitespace character is '{' or '[' and stops at the first
        /// non-empty line that does not match. A leading byte order mark is skipped explicitly so the ordinal
        /// character comparison does not depend on culture-sensitive string matching.
        /// </remarks>
        private static bool LooksLikeNdjson(string text, int threshold = NdjsonThreshold)
        {
            if (string.IsNullOrWhiteSpace(text))
            {
                return false;
            }

            if (text[0] == ByteOrderMark)
            {
                text = text.Substring(1);
            }

            int count = 0;
            using (var reader = new StringReader(text))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    line = line.Trim();
                    if (line.Length == 0)
                    {
                        continue;
                    }

                    if (line[0] != '{' && line[0] != '[')
                    {
                        break;
                    }

                    if (++count >= threshold)
                    {
                        return true;
                    }
                }
            }

            return false;
        }

        /// <summary>
        /// Classify a JSON head string into a JSON GIS format.
        /// </summary>
        /// <param name="head">Decoded head of the file or entry.</param>
        /// <returns>The detected format, or <c>Format.Unknown</c> when no fingerprint matches.</returns>
        /// <remarks>
        /// Uses case-insensitive substring fingerprints, checked in priority order: TopoJSON ("type" together
        /// with "topology"), EsriJSON ("spatialReference", "geometryType" or "attributes"), the NDJSON line
        /// heuristic, and finally GeoJSON ("FeatureCollection", "Feature" or "coordinates").
        /// </remarks>
        private static JsonFormatDetector.Format ClassifyJsonHeader(string head)
        {
            if (string.IsNullOrWhiteSpace(head))
            {
                return JsonFormatDetector.Format.Unknown;
            }

            if (ContainsToken(head, "\"type\"") && ContainsToken(head, "\"topology\""))
            {
                return JsonFormatDetector.Format.TopoJson;
            }

            if (ContainsToken(head, "\"spatialReference\"")
                || ContainsToken(head, "\"geometryType\"")
                || ContainsToken(head, "\"attributes\""))
            {
                return JsonFormatDetector.Format.EsriJson;
            }

            if (LooksLikeNdjson(head, NdjsonThreshold))
            {
                return JsonFormatDetector.Format.GeoJsonSeq;
            }

            if (ContainsToken(head, "\"FeatureCollection\"")
                || ContainsToken(head, "\"Feature\"")
                || ContainsToken(head, "\"coordinates\""))
            {
                return JsonFormatDetector.Format.GeoJson;
            }

            return JsonFormatDetector.Format.Unknown;
        }

        /// <summary>Case-insensitive ordinal substring test.</summary>
        private static bool ContainsToken(string text, string token)
        {
            return text.IndexOf(token, StringComparison.OrdinalIgnoreCase) >= 0;
        }

        /// <summary>
        /// Read up to <paramref name="maxBytes"/> bytes from the start of a file and return UTF-8 decoded text.
        /// </summary>
        /// <param name="path">Path to the file to read.</param>
        /// <param name="maxBytes">Maximum number of bytes to read from the start of the file.</param>
        /// <returns>UTF-8 decoded string of the bytes actually read, or an empty string on error.</returns>
        /// <remarks>
        /// The file is opened with <see cref="FileShare.Read"/>. Reading loops until the requested byte count is
        /// reached or the stream ends, because a single <see cref="Stream.Read(byte[], int, int)"/> call may
        /// return fewer bytes than requested. Any exception is logged at Debug level and yields an empty string.
        /// </remarks>
        private static string ReadHeadUtf8(string path, int maxBytes)
        {
            if (maxBytes <= 0)
            {
                return string.Empty;
            }

            try
            {
                using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
                {
                    var toRead = (int)Math.Min(maxBytes, fs.Length);
                    var buffer = new byte[toRead];

                    int total = 0;
                    while (total < toRead)
                    {
                        int read = fs.Read(buffer, total, toRead - total);
                        if (read <= 0)
                        {
                            break; // end of stream reached earlier than the reported length
                        }

                        total += read;
                    }

                    return Encoding.UTF8.GetString(buffer, 0, total);
                }
            }
            catch (Exception ex)
            {
                Log.Debug("ReadHeadUtf8: failed to read head of '" + path + "': " + ex.Message);
                return string.Empty;
            }
        }

        /// <summary>
        /// Read up to <paramref name="maxBytes"/> bytes from the start of an archive entry stream and return
        /// UTF-8 decoded text.
        /// </summary>
        /// <param name="entry">Archive entry to read.</param>
        /// <param name="maxBytes">Maximum number of bytes to read from the entry stream.</param>
        /// <returns>UTF-8 decoded string of the bytes actually read, or an empty string on error.</returns>
        /// <remarks>
        /// The entry stream is consumed in chunks of at most 8 KB and reading stops once
        /// <paramref name="maxBytes"/> bytes were copied or the stream ends. Any exception (including a null
        /// <paramref name="entry"/>) is logged at Debug level and yields an empty string.
        /// </remarks>
        private static string ReadEntryHeadUtf8(IArchiveEntry entry, int maxBytes)
        {
            try
            {
                using (var source = entry.OpenEntryStream())
                using (var collected = new MemoryStream())
                {
                    var chunk = new byte[8192];
                    int remaining = maxBytes;
                    int read;
                    while (remaining > 0 && (read = source.Read(chunk, 0, Math.Min(chunk.Length, remaining))) > 0)
                    {
                        collected.Write(chunk, 0, read);
                        remaining -= read;
                    }

                    return Encoding.UTF8.GetString(collected.ToArray());
                }
            }
            catch (Exception ex)
            {
                Log.Debug("ReadEntryHeadUtf8: failed to read entry '" + (entry?.Key ?? "") + "': " + ex.Message);
                return string.Empty;
            }
        }
    }
}