diff --git a/src/algorithms/bwt/bwtDecode.ts b/src/algorithms/bwt/bwtDecode.ts index bd7e5b2..b44feb0 100644 --- a/src/algorithms/bwt/bwtDecode.ts +++ b/src/algorithms/bwt/bwtDecode.ts @@ -4,12 +4,11 @@ import { END_OF_STRING } from "./utils/constants"; /** * Decodes a string that was encoded using the Burrows-Wheeler Transform (BWT). * - * 1. Create a createFrequencyMap for chars from input string. * * 2. Construct a mapping (`firstOccurrenceTable`) that indicates the starting position of each character in the sorted BWT string. * - * 3. Compute the index of each character in the BWT string as it appears in the sorted BWT string. + * 3. Compute the index of each character in the BWT string as it appears in the sorted BWT string. * * 4. Using the `charsIndexesInSortedInput` array, trace back from the given index to reconstruct the original string. * diff --git a/src/algorithms/bwt/bwtEncode.ts b/src/algorithms/bwt/bwtEncode.ts index f72ea32..f2299e9 100644 --- a/src/algorithms/bwt/bwtEncode.ts +++ b/src/algorithms/bwt/bwtEncode.ts @@ -1,39 +1,41 @@ import { END_OF_STRING } from "./utils/constants"; +import { buildSuffixArray } from "./utils/suffix-array"; /** * Encodes a given string using the Burrows-Wheeler Transform (BWT). * - * The function virtualizes all cyclic permutations of the input string, sorts them - * lexicographically, and constructs the BWT string from the last characters of each - * permutation. It also returns the index of the original string in the sorted list of permutations. + * This function constructs the suffix array for the input string using a + * prefix doubling algorithm with radix sort, which operates in O(n log n) time. + * The BWT string is then derived from the suffix array by taking the character + * preceding each suffix. * - * @param {string} input - The string to be encoded using the Burrows-Wheeler Transform. + * @param {string} input - The string to be encoded using the BWT. 
* @returns {{ bwt: string; index: number }} An object containing the encoded BWT string and the index of the original string. */ export function bwtEncode(input: string): { bwt: string; index: number } { input += END_OF_STRING; const length = input.length; - const permutationsIndexes = Array.from({ length }, (_, i) => i); - - // sorting all permutations - // using indexes - // without creating actual strings - permutationsIndexes.sort((a, b) => { - for (let i = 0; i < length; i++) { - const charA = input[(a + i) % length]; - const charB = input[(b + i) % length]; - if (charA < charB) return -1; - if (charA > charB) return 1; - } - return 0; - }); + // To construct the BWT string, we utilize the suffix array. + // The suffix array contains the starting indexes of all suffixes of the string, sorted in lexicographical order. + // This sorted order helps us build the BWT string by accessing the characters just before the start of each suffix + // in the cyclic permutation of the string. + const suffixArray = buildSuffixArray(input); const bwtResult: string[] = []; let originalIndex = -1; + // For each suffix indexed in the suffix array, we find the character that precedes it in this cyclic view. + // The formula (index + length - 1) % length computes the position of this preceding character: + // - `index` is the start of the current suffix in the sorted suffix array. + // - Add length - 1 to the index to move to the preceding position in the cyclic permutation. + // - Use modulo length to wrap around to the start of the string if needed. + // Example: For a suffix starting at index 5 in a string of length 6: + // - Calculate the preceding character position: (5 + 6 - 1) % 6 = 10 % 6 = 4 + // - The character at index 4 in the string is the one just before the suffix starting at index 5. + // Collecting these preceding characters for all suffixes in the sorted order gives us the BWT string.
/** Number of rank bits consumed by each radix-sort pass. */
const RADIX_BITS = 4;
/** Number of buckets needed to hold every possible `RADIX_BITS`-wide digit. */
const BUCKET_COUNT = 1 << RADIX_BITS;
/** Mask that extracts a single `RADIX_BITS`-wide digit from a rank. */
const DIGIT_MASK = BUCKET_COUNT - 1;

/**
 * Sorts the start positions of `input` by prefix doubling with radix sort.
 *
 * Positions are compared cyclically (every look-ahead index is taken modulo
 * the input length), so the result is the order of the cyclic rotations of
 * `input`. When `input` ends with a character that occurs nowhere else, this
 * order is the same as the order of its suffixes, i.e. the suffix array.
 * Rotations that are completely equal keep their ascending-index order,
 * because every sorting pass is stable.
 *
 * Example for "banana" (no terminator):
 *   initial ranks (char codes):      [98, 97, 110, 97, 110, 97]
 *   after round 1 (2 chars compared): [3, 2, 4, 2, 4, 1]
 *   after round 2 (4 chars compared): all ranks unique -> result [5, 3, 1, 0, 4, 2]
 *
 * @param input - The string whose positions are sorted. Ranks start as UTF-16
 *   code units (`charCodeAt`).
 * @returns The start positions in ascending lexicographic order; `[]` for an
 *   empty string.
 */
export function buildSuffixArray(input: string): number[] {
  const length = input.length;

  // Start positions, kept in their current sort order and reordered in place.
  const sortedSuffixes: number[] = Array.from({ length }, (_, i) => i);
  // `ranks[i]` is the sort key of the first `comparisonRange` characters that
  // start at position `i`; equal prefixes share a rank.
  let ranks: number[] = Array.from({ length }, (_, i) => input.charCodeAt(i));
  // Scratch buffer that receives the ranks of the next round, then swaps roles
  // with `ranks`.
  let nextRanks: number[] = new Array<number>(length).fill(0);

  // Each round doubles the compared prefix length: the key of a position is
  // the pair (rank of its first half, rank of the half `comparisonRange` ahead).
  for (
    let comparisonRange = 1;
    comparisonRange < length;
    comparisonRange *= 2
  ) {
    // LSD ordering of the pair: sort by the less significant component (the
    // look-ahead rank) first, then stably by the more significant one.
    radixSort(sortedSuffixes, (i) => ranks[(i + comparisonRange) % length]);
    radixSort(sortedSuffixes, (i) => ranks[i]);
    recalculateRanks(ranks, nextRanks, sortedSuffixes, comparisonRange, length);

    [ranks, nextRanks] = [nextRanks, ranks];

    // Ranks are assigned 1..k in sorted order, so the last position holds the
    // number of distinct ranks. Once that equals `length` every position is
    // uniquely ranked and further rounds cannot change the order.
    if (ranks[sortedSuffixes[length - 1]] === length) {
      break;
    }
  }

  return sortedSuffixes;
}

/**
 * Stable, in-place LSD radix sort of `sortedSuffixes` by `getRank`, processing
 * the rank `RADIX_BITS` bits at a time.
 *
 * Ranks are expected to be non-negative integers that fit in 31 bits (they are
 * shifted with `>>`).
 *
 * @param sortedSuffixes - Positions to reorder; mutated in place.
 * @param getRank - Returns the sort key of a position.
 * @returns The same `sortedSuffixes` array, now ordered by ascending rank.
 */
const radixSort = (
  sortedSuffixes: number[],
  getRank: (index: number) => number
): number[] => {
  // Plain loop instead of `Math.max(...array)`: spreading a large array into
  // call arguments can exceed the engine's argument limit and throw.
  let maxValue = 0;
  for (const suffix of sortedSuffixes) {
    const rank = getRank(suffix);
    if (rank > maxValue) {
      maxValue = rank;
    }
  }

  // Exact number of significant bits in the largest rank (0 when it is 0).
  const bitsCount = 32 - Math.clz32(maxValue);
  const buckets: number[][] = Array.from({ length: BUCKET_COUNT }, () => []);

  for (let bitPointer = 0; bitPointer < bitsCount; bitPointer += RADIX_BITS) {
    // Distribute by the current digit; appending preserves the existing order
    // inside a bucket, which keeps the sort stable.
    for (const suffix of sortedSuffixes) {
      const digit = (getRank(suffix) >> bitPointer) & DIGIT_MASK;
      buckets[digit].push(suffix);
    }

    // Write the buckets back in digit order, reusing the input array.
    let writeIndex = 0;
    for (const bucket of buckets) {
      for (const suffix of bucket) {
        sortedSuffixes[writeIndex++] = suffix;
      }
      bucket.length = 0;
    }
  }

  return sortedSuffixes;
};

/**
 * Assigns new ranks after a doubling round.
 *
 * Walks `suffixArray` in sorted order and gives each position a rank starting
 * at 1; the rank grows whenever the pair (current rank, rank `comparisonSpan`
 * positions ahead, cyclically) differs from the pair of the previous position.
 *
 * @param currentRanks - Ranks used for the sort that produced `suffixArray`.
 * @param nextRanks - Output buffer; entry `p` receives the new rank of position `p`.
 * @param suffixArray - Positions sorted by their (rank, look-ahead rank) pair.
 * @param comparisonSpan - Distance to the look-ahead position.
 * @param length - Number of positions (the input length).
 */
function recalculateRanks(
  currentRanks: number[],
  nextRanks: number[],
  suffixArray: number[],
  comparisonSpan: number,
  length: number
): void {
  // Nothing to rank; also avoids writing `nextRanks[undefined]`.
  if (length === 0) {
    return;
  }

  let rank = 1;
  nextRanks[suffixArray[0]] = rank;

  for (let i = 1; i < length; i++) {
    const currentSuffix = suffixArray[i];
    const previousSuffix = suffixArray[i - 1];

    const sameLeadingHalf =
      currentRanks[currentSuffix] === currentRanks[previousSuffix];
    const sameTrailingHalf =
      currentRanks[(currentSuffix + comparisonSpan) % length] ===
      currentRanks[(previousSuffix + comparisonSpan) % length];

    if (!(sameLeadingHalf && sameTrailingHalf)) {
      rank++;
    }
    nextRanks[currentSuffix] = rank;
  }
}