using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using GitConverter.Lib.Converters;
using GitConverter.Lib.Logging;
using SharpCompress.Archives;
namespace GitConverter.Lib.Factories
{
/// <summary>
/// Provides extension methods for automatic GIS format detection and converter resolution.
/// Supports both single files and archives with header-based classification for ambiguous formats.
/// </summary>
public static class ConverterFactoryInputExtensions
{
/// <summary>
/// Minimum number of consecutive JSON lines required to classify content as NDJSON format.
/// </summary>
private const int NdjsonThreshold = 2;
/// <summary>
/// Maximum bytes to read from file headers for format classification (8 KB).
/// Sufficient to classify JSON structure without loading entire files into memory.
/// </summary>
private const int HeaderReadLimit = 8 * 1024;
/// <summary>
/// Minimum header size required for reliable JSON classification (512 bytes).
/// Based on typical structure sizes:
/// - GeoJSON feature: ~100-200 bytes (type + coordinates)
/// - EsriJSON feature: ~150-300 bytes (spatialReference + attributes)
/// - TopoJSON: ~200-400 bytes (topology header)
/// - NDJSON: 2+ lines × 50 bytes average = 100+ bytes
/// Provides a safety margin for minified or compact JSON.
/// </summary>
private const int MinJsonParseBytes = 512;
/// <summary>
/// Maximum number of non-JSON lines allowed in NDJSON content.
/// Permits comments and blank lines while maintaining format integrity.
/// </summary>
private const int MaxNonJsonLinesInNdjson = 2;
/// <summary>
/// Centralized format descriptors for all supported GIS formats.
/// Maps format names to their file extensions and archive requirements.
/// </summary>
private static readonly IReadOnlyDictionary<string, FormatDescriptor> Formats =
new Dictionary<string, FormatDescriptor>(StringComparer.OrdinalIgnoreCase)
{
{ "GeoJson", new FormatDescriptor("GeoJson", new[] { ".geojson" }, new[] { ".geojson" }) },
{ "EsriJson", new FormatDescriptor("EsriJson", new[] { ".esrijson" }, new[] { ".esrijson" }) },
{ "GeoJsonSeq", new FormatDescriptor("GeoJsonSeq", new[] { ".jsonl", ".ndjson" }, Array.Empty()) },
{ "TopoJson", new FormatDescriptor("TopoJson", new[] { ".topojson" }, Array.Empty()) },
{ "Kml", new FormatDescriptor("Kml", new[] { ".kml" }, new[] { ".kml" }) },
{ "Kmz", new FormatDescriptor("Kmz", new[] { ".kmz" }, new[] { ".kml" }) },
{ "Shapefile", new FormatDescriptor("Shapefile", new[] { ".shp" }, new[] { ".shp", ".shx", ".dbf" }) },
{ "Osm", new FormatDescriptor("Osm", new[] { ".osm" }, new[] { ".osm" }) },
{ "Gpx", new FormatDescriptor("Gpx", new[] { ".gpx" }, new[] { ".gpx" }) },
{ "Gml", new FormatDescriptor("Gml", new[] { ".gml" }, new[] { ".gml" }) },
{ "Gdb", new FormatDescriptor("Gdb", new[] { ".gdb" }, new[] { ".gdb" }) },
{ "MapInfoInterchange", new FormatDescriptor("MapInfoInterchange", new[] { ".mif" }, new[] { ".mif" }) },
{ "MapInfoTab", new FormatDescriptor("MapInfoTab", new[] { ".tab", ".map", ".dat", ".id" }, new[] { ".tab", ".dat", ".map", ".id" }) },
{ "Csv", new FormatDescriptor("Csv", new[] { ".csv" }, new[] { ".csv" }) },
{ "GeoPackage", new FormatDescriptor("GeoPackage", new[] { ".gpkg" }, new[] { ".gpkg" }) },
};
/// <summary>
/// Describes a GIS format with associated file extensions and archive validation requirements.
/// </summary>
internal sealed class FormatDescriptor
{
/// <summary>
/// Gets the format name (e.g., "GeoJson", "Shapefile").
/// </summary>
public string Name { get; }
/// <summary>
/// Gets the list of file extensions associated with this format.
/// </summary>
public IReadOnlyList<string> FileExtensions { get; }
/// <summary>
/// Gets the list of required extensions that must be present in an archive for format validation.
/// </summary>
public IReadOnlyList<string> ArchiveRequirements { get; }
/// <summary>
/// Initializes a new instance of the <see cref="FormatDescriptor"/> class.
/// </summary>
/// <param name="name">The format name.</param>
/// <param name="fileExts">The file extensions for this format.</param>
/// <param name="archiveReqs">The required extensions for archive validation.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="name"/> is null.</exception>
public FormatDescriptor(string name, string[] fileExts, string[] archiveReqs)
{
Name = name ?? throw new ArgumentNullException(nameof(name));
FileExtensions = fileExts ?? Array.Empty<string>();
ArchiveRequirements = archiveReqs ?? Array.Empty<string>();
}
/// <summary>
/// Determines whether the specified extension matches any of this format's file extensions.
/// </summary>
/// <param name="extension">The file extension to check.</param>
/// <returns>true if the extension matches; otherwise, false.</returns>
public bool MatchesFileExtension(string extension)
{
return FileExtensions.Contains(extension, StringComparer.OrdinalIgnoreCase);
}
/// <summary>
/// Determines whether all archive requirements are satisfied by the discovered extensions.
/// </summary>
/// <param name="discoveredExtensions">The set of extensions found in the archive.</param>
/// <returns>true if all requirements are met; otherwise, false.</returns>
public bool MatchesArchiveRequirements(ISet<string> discoveredExtensions)
{
return ArchiveRequirements.All(req => discoveredExtensions.Contains(req));
}
}
/// <summary>
/// Attempts to create a converter for the specified GIS input file path.
/// Lightweight overload without diagnostic information.
/// </summary>
/// <param name="factory">The converter factory to use.</param>
/// <param name="gisInputFilePath">The path to the GIS input file.</param>
/// <param name="converter">When successful, contains the created converter; otherwise, null.</param>
/// <returns>true if converter creation succeeded; otherwise, false.</returns>
public static bool TryCreateForInput(
this IConverterFactory factory,
string gisInputFilePath,
out IConverter converter)
{
return TryCreateForInput(factory, gisInputFilePath, out converter, out _);
}
/// <summary>
/// Attempts to create a converter for the specified GIS input file path with detailed diagnostic information.
/// </summary>
/// <param name="factory">The converter factory to use.</param>
/// <param name="gisInputFilePath">The path to the GIS input file.</param>
/// <param name="converter">When successful, contains the created converter; otherwise, null.</param>
/// <param name="detectReason">Contains diagnostic information about the detection process.</param>
/// <returns>true if converter creation succeeded; otherwise, false.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="factory"/> is null.</exception>
/// <remarks>
/// <para>
/// Detection priority:
/// 1. Archives: Fast-path extension checks → KMZ detection → JSON voting → requirement matching
/// 2. Single files: Explicit extension mapping → JSON format detection (with fallback) → generic extension mapping
/// </para>
/// <para>
/// All file reads are bounded and streaming. Archives are never extracted to disk.
/// </para>
/// </remarks>
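/// <example>
/// A minimal usage sketch (assumes an <see cref="IConverterFactory"/> instance named <c>factory</c>;
/// the input path is hypothetical):
/// <code>
/// if (factory.TryCreateForInput(@"C:\data\cities.geojson", out var converter, out var reason))
/// {
///     // reason describes how the format was detected; converter is ready to use
/// }
/// else
/// {
///     // reason explains why detection failed (missing file, unknown extension, ambiguous JSON, ...)
/// }
/// </code>
/// </example>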
public static bool TryCreateForInput(
this IConverterFactory factory,
string gisInputFilePath,
out IConverter converter,
out string detectReason)
{
converter = null;
detectReason = null;
if (factory == null) throw new ArgumentNullException(nameof(factory));
// Validate input path is not null or empty
if (string.IsNullOrWhiteSpace(gisInputFilePath))
{
detectReason = "input path is null or whitespace";
Log.Error($"TryCreateForInput: {detectReason}");
return false;
}
// Validate file exists
if (!File.Exists(gisInputFilePath))
{
detectReason = $"file does not exist: {gisInputFilePath}";
Log.Error($"TryCreateForInput: {detectReason}");
return false;
}
// Validate file is not empty
FileInfo fileInfo;
try
{
fileInfo = new FileInfo(gisInputFilePath);
}
catch (Exception ex)
{
detectReason = $"failed to get file info: {ex.Message}";
Log.Error($"TryCreateForInput: {detectReason}", ex);
return false;
}
if (fileInfo.Length == 0)
{
detectReason = "file is empty (0 bytes)";
Log.Warn($"TryCreateForInput: {detectReason}");
return false;
}
try
{
return ConverterUtils.IsArchiveFile(gisInputFilePath)
? TryDetectArchiveFormat(factory, gisInputFilePath, out converter, out detectReason)
: TryDetectSingleFileFormat(factory, gisInputFilePath, out converter, out detectReason);
}
catch (Exception ex)
{
detectReason = $"unexpected error during format detection: {ex.GetType().Name} - {ex.Message}";
Log.Error(detectReason, ex);
return false;
}
}
/// <summary>
/// Attempts to detect the GIS format from an archive file without extraction.
/// </summary>
/// <param name="factory">The converter factory to use.</param>
/// <param name="archivePath">The path to the archive file.</param>
/// <param name="converter">When successful, contains the created converter; otherwise, null.</param>
/// <param name="detectReason">Contains diagnostic information about the detection process.</param>
/// <returns>true if format detection and converter creation succeeded; otherwise, false.</returns>
private static bool TryDetectArchiveFormat(
IConverterFactory factory,
string archivePath,
out IConverter converter,
out string detectReason)
{
converter = null;
var entries = ConverterUtils.TryListArchiveEntries(archivePath);
if (entries == null || !entries.Any())
{
detectReason = "failed to list archive entries or archive is empty";
Log.Warn($"archive detection failed: {detectReason}");
return false;
}
var discoveredExts = CollectExtensionsFromEntries(entries, out var hasTopLevelDocKml);
var outerExt = (Path.GetExtension(archivePath) ?? string.Empty).ToLowerInvariant();
// Fast-path: Explicit JSON variant extensions bypass voting
if (TryMatchExplicitJsonExtension(discoveredExts, out var jsonFormat))
{
detectReason = $"archive contains {jsonFormat} entries";
return factory.TryCreate(jsonFormat, out converter);
}
// KMZ detection via outer extension or presence of doc.kml
if (IsKmzArchive(outerExt, hasTopLevelDocKml))
{
detectReason = outerExt == ".kmz"
? "archive has .kmz extension"
: "archive contains top-level doc.kml (KMZ structure)";
return factory.TryCreate("Kmz", out converter);
}
// Generic .json files require header-based voting
if (discoveredExts.Contains(".json"))
{
var jsonEntries = entries.Where(e => e.EndsWith(".json", StringComparison.OrdinalIgnoreCase));
var voteResult = VoteOnJsonEntries(archivePath, jsonEntries);
if (voteResult.IsSuccess)
{
detectReason = $"json voting: {voteResult.Reason}";
Log.Debug($"archive json detection: {detectReason}");
return factory.TryCreate(voteResult.Winner, out converter);
}
detectReason = $"json format ambiguous in archive: {voteResult.Reason}";
Log.Warn(detectReason);
return false;
}
// Fallback: Strict requirement matching for non-JSON formats
foreach (var fmt in Formats.Values.Where(f => f.ArchiveRequirements.Count > 0 && f.Name != "Kmz"))
{
if (fmt.MatchesArchiveRequirements(discoveredExts))
{
detectReason = $"archive requirements met for {fmt.Name}: {string.Join(", ", fmt.ArchiveRequirements)}";
return factory.TryCreate(fmt.Name, out converter);
}
}
detectReason = $"no format matched archive contents (extensions found: {string.Join(", ", discoveredExts)})";
Log.Warn(detectReason);
return false;
}
/// <summary>
/// Attempts to detect the GIS format from a single file.
/// </summary>
/// <param name="factory">The converter factory to use.</param>
/// <param name="filePath">The path to the file.</param>
/// <param name="converter">When successful, contains the created converter; otherwise, null.</param>
/// <param name="detectReason">Contains diagnostic information about the detection process.</param>
/// <returns>true if format detection and converter creation succeeded; otherwise, false.</returns>
private static bool TryDetectSingleFileFormat(
IConverterFactory factory,
string filePath,
out IConverter converter,
out string detectReason)
{
converter = null;
var ext = (Path.GetExtension(filePath) ?? string.Empty).ToLowerInvariant();
if (string.IsNullOrEmpty(ext))
{
detectReason = "file has no extension";
Log.Warn(detectReason);
return false;
}
// Fast-path: Explicit non-JSON extensions
var explicitFormat = Formats.Values.FirstOrDefault(f =>
f.MatchesFileExtension(ext) && !ext.Contains("json"));
if (explicitFormat != null)
{
detectReason = $"extension '{ext}' mapped to {explicitFormat.Name}";
return factory.TryCreate(explicitFormat.Name, out converter);
}
// JSON variants require content inspection
if (ext.EndsWith("json", StringComparison.OrdinalIgnoreCase))
{
var jsonFormat = DetectJsonFormat(filePath, out var jsonReason);
if (jsonFormat == JsonFormatDetector.Format.Unknown)
{
detectReason = $"json format could not be determined: {jsonReason}";
Log.Error(detectReason);
return false;
}
var converterKey = MapJsonFormatToConverter(jsonFormat);
if (string.IsNullOrEmpty(converterKey))
{
detectReason = $"no converter mapping for json format: {jsonFormat}";
Log.Error(detectReason);
return false;
}
detectReason = $"detected json format: {jsonFormat} ({jsonReason})";
return factory.TryCreate(converterKey, out converter);
}
detectReason = $"unknown file extension '{ext}'";
Log.Warn(detectReason);
return false;
}
/// <summary>
/// Collects all file extensions from archive entries, including nested paths and .gdb folders.
/// </summary>
/// <param name="entries">The archive entry paths to analyze.</param>
/// <param name="hasTopLevelDocKml">Set to true if a top-level doc.kml file is found (KMZ indicator).</param>
/// <returns>A set of lowercase file extensions found in the archive.</returns>
private static HashSet<string> CollectExtensionsFromEntries(
IEnumerable<string> entries,
out bool hasTopLevelDocKml)
{
var extensions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
hasTopLevelDocKml = false;
foreach (var entryPath in entries)
{
if (string.IsNullOrWhiteSpace(entryPath)) continue;
var normalized = entryPath.Replace('\\', '/').Trim('/');
// Check for top-level doc.kml (KMZ indicator)
if (string.Equals(normalized, "doc.kml", StringComparison.OrdinalIgnoreCase))
hasTopLevelDocKml = true;
// Extract all extensions from path segments (handles .gdb folders)
var segments = normalized.Split(new[] { '/' }, StringSplitOptions.RemoveEmptyEntries);
foreach (var segment in segments)
{
var ext = Path.GetExtension(segment);
if (!string.IsNullOrEmpty(ext))
extensions.Add(ext.ToLowerInvariant());
}
}
return extensions;
}
/// <summary>
/// Checks for explicit JSON variant extensions that bypass voting.
/// </summary>
/// <param name="extensions">The set of extensions to check.</param>
/// <param name="formatName">When successful, contains the format name; otherwise, null.</param>
/// <returns>true if an explicit JSON extension was found; otherwise, false.</returns>
private static bool TryMatchExplicitJsonExtension(
ISet<string> extensions,
out string formatName)
{
formatName = null;
var explicitJsonFormats = new[]
{
(".geojson", "GeoJson"),
(".esrijson", "EsriJson"),
(".topojson", "TopoJson"),
(".jsonl", "GeoJsonSeq"),
(".ndjson", "GeoJsonSeq")
};
foreach (var (ext, format) in explicitJsonFormats)
{
if (extensions.Contains(ext))
{
formatName = format;
return true;
}
}
return false;
}
/// <summary>
/// Determines whether an archive is KMZ format based on outer extension or doc.kml presence.
/// </summary>
/// <param name="outerExtension">The archive file's extension.</param>
/// <param name="hasTopLevelDocKml">Whether a top-level doc.kml file was found.</param>
/// <returns>true if the archive is KMZ format; otherwise, false.</returns>
private static bool IsKmzArchive(string outerExtension, bool hasTopLevelDocKml)
{
return outerExtension == ".kmz" || hasTopLevelDocKml;
}
/// <summary>
/// Performs header-based voting on JSON entries to determine the dominant format.
/// Uses a format-specific tiebreaker when multiple formats receive equal votes.
/// </summary>
/// <param name="archivePath">The path to the archive file.</param>
/// <param name="jsonEntries">The JSON entry paths to vote on.</param>
/// <returns>A <see cref="VoteResult"/> containing the winning format or failure reason.</returns>
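/// <example>
/// Worked outcome sketch (hypothetical archive): with votes GeoJson=2, EsriJson=1, GeoJson wins 2/3.
/// With a 1-1 tie between GeoJson and EsriJson, EsriJson wins via the specificity tiebreaker,
/// since more constrained formats are preferred over generic ones.
/// </example>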
private static VoteResult VoteOnJsonEntries(string archivePath, IEnumerable<string> jsonEntries)
{
var votes = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
try
{
using (var archive = ArchiveFactory.Open(archivePath))
{
foreach (var entry in archive.Entries.Where(e => !e.IsDirectory))
{
var entryName = entry.Key ?? string.Empty;
if (!jsonEntries.Contains(entryName, StringComparer.OrdinalIgnoreCase)) continue;
try
{
var header = ReadEntryHeaderUtf8(entry, HeaderReadLimit);
var format = ClassifyJsonContent(header);
if (format != JsonFormatDetector.Format.Unknown)
{
var converterKey = MapJsonFormatToConverter(format);
if (!string.IsNullOrEmpty(converterKey))
{
votes.TryGetValue(converterKey, out var count);
votes[converterKey] = count + 1;
}
}
}
catch (Exception ex)
{
Log.Debug($"failed to classify json entry '{entryName}': {ex.Message}");
}
}
}
}
catch (Exception ex)
{
return VoteResult.Failure($"archive reading failed: {ex.Message}");
}
if (votes.Count == 0)
{
return VoteResult.Failure("no json entries could be classified (all unknown or corrupted)");
}
var maxVotes = votes.Values.Max();
var winners = votes.Where(kv => kv.Value == maxVotes).Select(kv => kv.Key).ToArray();
// Tiebreaker: Prefer more specific/constrained formats over generic ones
// Priority: EsriJson (most specific) > TopoJson > GeoJson > GeoJsonSeq (most generic)
var tiebreakPriority = new[] { "EsriJson", "TopoJson", "GeoJson", "GeoJsonSeq" };
var winner = tiebreakPriority.FirstOrDefault(p => winners.Contains(p, StringComparer.OrdinalIgnoreCase))
?? winners.OrderBy(w => w).First(); // Fallback to alphabetical
var voteDetails = string.Join(", ", votes.Select(kv => $"{kv.Key}={kv.Value}"));
var reason = winners.Length > 1
? $"{winner} wins {maxVotes}-vote tie via specificity tiebreaker (candidates: {string.Join(", ", winners)}; votes: {voteDetails})"
: $"{winner} wins with {maxVotes}/{votes.Values.Sum()} votes ({voteDetails})";
return VoteResult.Success(winner, reason);
}
/// <summary>
/// Encapsulates the result of a voting operation.
/// </summary>
private sealed class VoteResult
{
/// <summary>
/// Gets a value indicating whether the voting succeeded.
/// </summary>
public bool IsSuccess { get; }
/// <summary>
/// Gets the winning format name, or null if voting failed.
/// </summary>
public string Winner { get; }
/// <summary>
/// Gets the diagnostic reason describing the vote outcome.
/// </summary>
public string Reason { get; }
private VoteResult(bool success, string winner, string reason)
{
IsSuccess = success;
Winner = winner;
Reason = reason;
}
/// <summary>
/// Creates a successful vote result.
/// </summary>
public static VoteResult Success(string winner, string reason) => new VoteResult(true, winner, reason);
/// <summary>
/// Creates a failed vote result.
/// </summary>
public static VoteResult Failure(string reason) => new VoteResult(false, null, reason);
}
/// <summary>
/// Detects JSON format from a file using a fallback chain: JsonFormatDetector → header sniff.
/// </summary>
/// <param name="filePath">The path to the JSON file.</param>
/// <param name="reason">Contains diagnostic information about the detection method used.</param>
/// <returns>The detected JSON format, or <see cref="JsonFormatDetector.Format.Unknown"/> if detection failed.</returns>
private static JsonFormatDetector.Format DetectJsonFormat(string filePath, out string reason)
{
// Primary: Use JsonFormatDetector for full parsing
try
{
var format = JsonFormatDetector.DetectFromFile(filePath);
if (format != JsonFormatDetector.Format.Unknown)
{
reason = "JsonFormatDetector.DetectFromFile (full parse)";
return format;
}
}
catch (Exception ex)
{
Log.Debug($"JsonFormatDetector.DetectFromFile failed: {ex.Message}; falling back to header sniff");
}
// Fallback: Bounded header read (handles large files)
var header = ReadFileHeaderUtf8(filePath, HeaderReadLimit);
var sniffed = ClassifyJsonContent(header);
reason = sniffed switch
{
JsonFormatDetector.Format.GeoJsonSeq => $"header sniff: ndjson pattern (>= {NdjsonThreshold} json lines)",
JsonFormatDetector.Format.TopoJson => "header sniff: topology object detected",
JsonFormatDetector.Format.EsriJson => "header sniff: esri json markers detected",
JsonFormatDetector.Format.GeoJson => "header sniff: geojson object detected",
_ => "header sniff: format unknown"
};
return sniffed;
}
/// <summary>
/// Classifies JSON content from a bounded header string using substring-based heuristics.
/// </summary>
/// <param name="content">The JSON content to classify.</param>
/// <returns>The detected JSON format, or <see cref="JsonFormatDetector.Format.Unknown"/> if classification failed.</returns>
/// <remarks>
/// <para>
/// Detection is heuristic-based and may produce false positives on:
/// - Truncated headers (shorter than <see cref="MinJsonParseBytes"/>)
/// - Keywords appearing in string values or comments
/// - Unusual JSON structures
/// </para>
/// <para>
/// Priority order minimizes false positives:
/// 1. TopoJSON (most distinctive: requires "type" + "Topology")
/// 2. EsriJSON (distinctive: spatialReference OR geometryType)
/// 3. NDJSON (structural: multiple JSON objects per line)
/// 4. GeoJSON (common: FeatureCollection/Feature/coordinates)
/// </para>
/// </remarks>
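/// <example>
/// Sketch of the markers the heuristic keys on (illustrative, abbreviated snippets; a real header
/// must also satisfy the <see cref="MinJsonParseBytes"/> length check before it is classified):
/// <code>
/// {"type":"Topology","objects":{ ... }}                 → TopoJson
/// {"spatialReference":{"wkid":4326},"features":[ ... ]} → EsriJson
/// {"type":"FeatureCollection","features":[ ... ]}       → GeoJson
/// </code>
/// </example>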
private static JsonFormatDetector.Format ClassifyJsonContent(string content)
{
if (string.IsNullOrWhiteSpace(content) || content.Length < MinJsonParseBytes)
return JsonFormatDetector.Format.Unknown;
// Priority 1: TopoJSON (rare but highly distinctive)
if (content.IndexOf("\"type\"", StringComparison.OrdinalIgnoreCase) >= 0 &&
content.IndexOf("\"Topology\"", StringComparison.Ordinal) >= 0)
return JsonFormatDetector.Format.TopoJson;
// Priority 2: EsriJSON (distinctive properties)
// Require spatialReference OR geometryType (both are Esri-specific)
bool hasSpatialRef = content.IndexOf("\"spatialReference\"", StringComparison.OrdinalIgnoreCase) >= 0;
bool hasGeometryType = content.IndexOf("\"geometryType\"", StringComparison.OrdinalIgnoreCase) >= 0;
if (hasSpatialRef || hasGeometryType)
return JsonFormatDetector.Format.EsriJson;
// Priority 3: NDJSON (structural pattern)
if (LooksLikeNdjson(content, NdjsonThreshold))
return JsonFormatDetector.Format.GeoJsonSeq;
// Priority 4: GeoJSON (most common, check last)
if (content.IndexOf("\"FeatureCollection\"", StringComparison.OrdinalIgnoreCase) >= 0 ||
content.IndexOf("\"Feature\"", StringComparison.OrdinalIgnoreCase) >= 0 ||
content.IndexOf("\"coordinates\"", StringComparison.OrdinalIgnoreCase) >= 0)
return JsonFormatDetector.Format.GeoJson;
return JsonFormatDetector.Format.Unknown;
}
/// <summary>
/// Determines whether content resembles NDJSON (newline-delimited JSON) format.
/// Allows limited non-JSON lines to handle comments and whitespace.
/// </summary>
/// <param name="content">The content to analyze.</param>
/// <param name="threshold">The minimum number of JSON lines required.</param>
/// <returns>true if content appears to be NDJSON format; otherwise, false.</returns>
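/// <example>
/// Content shaped like the following meets the default threshold of two JSON lines:
/// <code>
/// {"type":"Feature","geometry":{"type":"Point","coordinates":[0,0]}}
/// {"type":"Feature","geometry":{"type":"Point","coordinates":[1,1]}}
/// </code>
/// </example>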
private static bool LooksLikeNdjson(string content, int threshold)
{
if (string.IsNullOrWhiteSpace(content)) return false;
int jsonLineCount = 0;
int nonJsonLineCount = 0;
using (var reader = new StringReader(content))
{
string line;
while ((line = reader.ReadLine()) != null)
{
var trimmed = line.Trim();
if (trimmed.Length == 0) continue; // Skip blank lines
// Lines must start with { or [ to qualify as JSON
if (trimmed[0] == '{' || trimmed[0] == '[')
{
if (++jsonLineCount >= threshold)
return true;
}
else
{
// Allow limited non-JSON lines (comments, headers)
if (++nonJsonLineCount > MaxNonJsonLinesInNdjson)
return false;
}
}
}
return jsonLineCount >= threshold;
}
/// <summary>
/// Maps <see cref="JsonFormatDetector.Format"/> enum values to converter key strings.
/// </summary>
/// <param name="format">The JSON format to map.</param>
/// <returns>The converter key string, or null for unknown formats.</returns>
private static string MapJsonFormatToConverter(JsonFormatDetector.Format format)
{
return format switch
{
JsonFormatDetector.Format.GeoJson => "GeoJson",
JsonFormatDetector.Format.EsriJson => "EsriJson",
JsonFormatDetector.Format.GeoJsonSeq => "GeoJsonSeq",
JsonFormatDetector.Format.TopoJson => "TopoJson",
_ => null // Unknown format
};
}
/// <summary>
/// Reads a bounded header from a file using streaming I/O (no full file load).
/// </summary>
/// <param name="path">The file path to read from.</param>
/// <param name="maxBytes">The maximum number of bytes to read.</param>
/// <returns>The UTF-8 decoded header content, or an empty string on error.</returns>
private static string ReadFileHeaderUtf8(string path, int maxBytes)
{
try
{
using (var stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read, bufferSize: 4096))
{
var bytesToRead = (int)Math.Min(maxBytes, stream.Length);
var buffer = new byte[bytesToRead];
var bytesRead = stream.Read(buffer, 0, bytesToRead);
return Encoding.UTF8.GetString(buffer, 0, bytesRead);
}
}
catch (Exception ex)
{
Log.Debug($"ReadFileHeaderUtf8 failed for '{path}': {ex.Message}");
return string.Empty;
}
}
/// <summary>
/// Reads a bounded header from an archive entry using streaming I/O (no extraction to disk).
/// </summary>
/// <param name="entry">The archive entry to read from.</param>
/// <param name="maxBytes">The maximum number of bytes to read.</param>
/// <returns>The UTF-8 decoded header content, or an empty string on error.</returns>
private static string ReadEntryHeaderUtf8(IArchiveEntry entry, int maxBytes)
{
try
{
using (var stream = entry.OpenEntryStream())
using (var buffer = new MemoryStream())
{
var temp = new byte[4096];
int remaining = maxBytes;
int read;
while (remaining > 0 && (read = stream.Read(temp, 0, Math.Min(temp.Length, remaining))) > 0)
{
buffer.Write(temp, 0, read);
remaining -= read;
}
return Encoding.UTF8.GetString(buffer.ToArray());
}
}
catch (Exception ex)
{
Log.Debug($"ReadEntryHeaderUtf8 failed for '{entry?.Key ?? ""}': {ex.Message}");
return string.Empty;
}
}
}
}