using System; using System.Collections.Generic; using System.Text.RegularExpressions; namespace GitConverter.Lib.Factories { /// /// Helper utilities used by factories: key normalization and simple suggestion logic. /// /// /// /// FactoryHelpers provides small, deterministic utilities used by the converter factory and /// related test helpers to normalize user-supplied option keys and to produce friendly /// suggestions when an unknown option is supplied on the command line. /// /// /// Responsibilities /// - NormalizeKey: produce a compact lookup key that is tolerant to case, spacing, punctuation /// and surrounding quotes so a small registry can match a wide range of user input. /// - SuggestClosest: compute a best-effort human-friendly suggestion using normalized keys /// and Levenshtein edit distance, accepting only suggestions that meet a conservative /// threshold for plausibility. /// - LevenshteinDistance: efficient, space-optimized edit-distance calculation used by /// SuggestClosest. /// /// /// Design notes /// - The normalization intentionally limits characters to ASCII a-z and digits 0-9 to keep /// matching behavior predictable for canonical option names like "EsriJson" or "GeoJson". /// If the repository needs to support non-ASCII display keys update NormalizeKey accordingly. /// - SuggestClosest is designed for CLI UX improvements only and should not be used for /// security-related decisions. /// /// /// Testing guidance /// - Unit tests should cover exact normalized matches, acceptable near-miss suggestions, /// and cases where no suggestion is returned because the distance threshold is exceeded. /// - LevenshteinDistance is exposed as internal to allow direct unit testing via InternalsVisibleTo. /// /// public static class FactoryHelpers { // Precompiled regex - keep only ascii a-z and 0-9 after lowercasing. private static readonly Regex _s_normalizeRegex = new Regex("[^a-z0-9]", RegexOptions.Compiled | RegexOptions.CultureInvariant); /// /// Normalize a raw CLI key into a compact lookup key. /// Removes surrounding quotes, lowercases and strips non‑alphanumeric characters. /// /// User-supplied key (may include spaces, punctuation or quotes). /// Normalized compact key suitable for dictionary lookups. Returns empty string for null/whitespace input. /// /// /// Normalization strategy: /// - Trim surrounding whitespace and remove surrounding double quotes. /// - Convert to lower-case (invariant culture). /// - Remove any character that is not in the ASCII set [a-z0-9]. /// /// /// Rationale: /// The normalization makes user input tolerant to common variations such as "GeoJson", /// "geo json", "\"GeoJson\"", or "geo-json" and maps them to the same lookup key. /// Stripping non-ASCII characters is an intentional decision to simplify matching for the /// small canonical set of converter names used by this project. /// /// /// Note: if your display names include Unicode characters consider updating this method to /// perform Unicode normalization and transliteration instead of stripping. /// /// public static string NormalizeKey(string raw) { if (string.IsNullOrWhiteSpace(raw)) return string.Empty; var s = raw.Trim().Trim('"').ToLowerInvariant(); return _s_normalizeRegex.Replace(s, string.Empty); } /// /// Suggests the closest candidate from the provided collection for the given target string. /// Returns null when no reasonable suggestion is found. /// /// User-supplied key (possibly misspelled). /// Collection of available canonical keys (display names). /// /// A single best display name from , or null if no candidate is considered /// sufficiently close to the . /// /// /// /// Purpose /// - Provide a friendly CLI suggestion (for example: "Did you mean: EsriJson?") when users supply an /// unknown conversion option. /// /// /// Algorithm (summary) /// - Normalize both the target and candidates via to make comparisons /// tolerant to case, punctuation, spacing and surrounding quotes. /// - If any candidate's normalized form exactly matches the normalized target, return that candidate immediately. /// - Otherwise compute the Levenshtein edit distance between the normalized target and each normalized candidate, /// selecting the candidate with the smallest distance. /// - Apply a threshold to avoid low-quality suggestions: accept the best candidate only when /// distance <= max(1, target.Length / 3). Otherwise return null. /// /// /// Tie-breaking & ordering /// - When multiple candidates have the same minimal distance the first encountered candidate (iteration order /// of ) is returned. If stable selection is important, ensure callers provide /// candidates in a deterministic order. /// /// /// Performance & complexity /// - Let N be the number of candidates, T the normalized target length and L the typical candidate length. /// - Levenshtein distance is computed in O(L * T) time and O(max(L,T)) space. The implementation applies a /// simple length-difference heuristic to skip obviously distant candidates early. /// /// /// Locale and normalization caveats /// - Normalization currently strips non-ascii a-z and 0-9 characters. This intentionally simplifies matching /// for typical keys like "EsriJson" or "GeoJson". If your keys include non-ASCII characters you should /// update NormalizeKey to preserve or transliterate as required. /// /// /// Usage guidance and tests /// - Use for friendly CLI messages only — do not rely on this method for security/authorization decisions. /// - Unit tests should cover: /// - Exact normalized matches (different casing/spacing). /// - Near-miss matches (1-3 edits). /// - Inputs that should produce no suggestion (distance above threshold). /// - Tie scenarios to confirm deterministic selection. /// - Null/empty target and null candidates collection behavior. /// /// /// Alternatives / enhancements /// - To return multiple candidates consider returning a ranked list instead of a single best match. /// - For large candidate sets consider using BK-trees or other approximate-string-search indexes to improve /// lookup performance. /// /// public static string SuggestClosest(string target, IEnumerable candidates) { if (string.IsNullOrWhiteSpace(target)) return null; if (candidates == null) return null; var t = NormalizeKey(target); // Precompute normalized candidates and preserve display names; skip null/whitespace. var list = new List<(string display, string norm)>(); foreach (var c in candidates) { if (string.IsNullOrWhiteSpace(c)) continue; list.Add((c, NormalizeKey(c))); } if (list.Count == 0) return null; // Fast exact normalized match foreach (var pair in list) if (pair.norm == t) return pair.display; string best = null; var bestScore = int.MaxValue; foreach (var pair in list) { var lenDiff = Math.Abs(pair.norm.Length - t.Length); if (lenDiff > bestScore) continue; var dist = LevenshteinDistance(t, pair.norm); if (dist < bestScore) { bestScore = dist; best = pair.display; if (bestScore == 0) break; } } var threshold = Math.Max(1, t.Length / 3); return bestScore <= threshold ? best : null; } /// /// Compute Levenshtein edit distance (space-optimized). /// Made internal so unit tests (via InternalsVisibleTo) can call it directly without reflection. /// /// First (normalized) string. /// Second (normalized) string. /// /// The edit distance (minimum number of single-character insertions, deletions or substitutions) /// required to transform into . /// /// /// /// Implementation details /// - This implementation is a space-optimized dynamic programming variant. /// - It maintains only two one-dimensional arrays (previous and current row) rather than the full /// two-dimensional matrix. Memory usage is therefore O(m) where m == b.Length. /// - Time complexity is O(n * m) where n == a.Length and m == b.Length. /// /// /// Behavior and edge cases /// - Null inputs are treated as empty strings. /// - If one string is empty the distance equals the length of the other string (all insertions or deletions). /// /// /// Performance notes /// - For very long strings (thousands of characters) the O(n * m) cost can be significant; /// consider early exits, limiting candidate lengths, or indexing structures for large candidate sets. /// - The current calling usage normalizes short CLI keys so practical inputs are typically small and performant. /// /// /// Testing guidance /// - Unit tests should validate: /// - Known pairs (e.g., ("kitten","sitting") => 3). /// - Empty vs non-empty strings. /// - Symmetry: distance(a,b) == distance(b,a). /// - Behavior with Unicode/extended characters if NormalizeKey is adapted. /// /// internal static int LevenshteinDistance(string a, string b) { // Treat null as empty for predictable behavior. if (a == null) a = string.Empty; if (b == null) b = string.Empty; // lengths of the inputs var n = a.Length; var m = b.Length; // If either string is empty, distance is the length of the other (all insertions or deletions). if (n == 0) return m; if (m == 0) return n; // 'prev' holds the previous row of DP values; initialized to 0..m var prev = new int[m + 1]; for (var j = 0; j <= m; j++) prev[j] = j; // iterate rows (characters of a) for (var i = 1; i <= n; i++) { // 'cur' holds the current row; cur[0] == i because converting first i chars of 'a' to empty b requires i deletions var cur = new int[m + 1]; cur[0] = i; // iterate columns (characters of b) for (var j = 1; j <= m; j++) { // cost = 0 if characters match, else 1 for substitution var cost = (a[i - 1] == b[j - 1]) ? 0 : 1; // compute minimal cost among: // - insertion (cur[j - 1] + 1) // - deletion (prev[j] + 1) // - substitution (prev[j - 1] + cost) cur[j] = Math.Min(Math.Min(cur[j - 1] + 1, prev[j] + 1), prev[j - 1] + cost); } // move current row to previous for next iteration (space optimization) prev = cur; } // final cell contains the edit distance return prev[m]; } } }