Skip to content

Commit

Permalink
refactor: bwt encoding optimized, comments updated in bwtDecode.ts
Browse files Browse the repository at this point in the history
  • Loading branch information
petrlipatov committed Sep 6, 2024
1 parent c719503 commit 5707da1
Show file tree
Hide file tree
Showing 3 changed files with 144 additions and 21 deletions.
3 changes: 1 addition & 2 deletions src/algorithms/bwt/bwtDecode.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,11 @@ import { END_OF_STRING } from "./utils/constants";
/**
* Decodes a string that was encoded using the Burrows-Wheeler Transform (BWT).
*
* 1. Create a frequency map (`createFrequencyMap`) of the characters in the input string.
*
* 2. Construct a mapping (`firstOccurrenceTable`) that indicates the starting position of each character in the sorted BWT string.
*
* 3. Compute the index of each character in the BWT string as it appears in the sorted BWT string.
* 3. Compute the index of each character in the BWT string as it appears in the sorted BWT string.
*
* 4. Using the `charsIndexesInSortedInput` array, trace back from the given index to reconstruct the original string.
*
Expand Down
40 changes: 21 additions & 19 deletions src/algorithms/bwt/bwtEncode.ts
Original file line number Diff line number Diff line change
@@ -1,39 +1,41 @@
import { END_OF_STRING } from "./utils/constants";
import { buildSuffixArray } from "./utils/suffix-array";

/**
* Encodes a given string using the Burrows-Wheeler Transform (BWT).
*
* The function virtualizes all cyclic permutations of the input string, sorts them
* lexicographically, and constructs the BWT string from the last characters of each
* permutation. It also returns the index of the original string in the sorted list of permutations.
* This function constructs the suffix array for the input string using a
* prefix doubling algorithm with radix sort, which operates in O(n log n) time.
* The BWT string is then derived from the suffix array by taking the character
* preceding each suffix.
*
* @param {string} input - The string to be encoded using the Burrows-Wheeler Transform.
* @param {string} input - The string to be encoded using the BWT.
* @returns {{ bwt: string; index: number }} An object containing the encoded BWT string and the index of the original string.
*/
export function bwtEncode(input: string): { bwt: string; index: number } {
input += END_OF_STRING;
const length = input.length;

const permutationsIndexes = Array.from({ length }, (_, i) => i);

// sorting all permutations
// using indexes
// without creating actual strings
permutationsIndexes.sort((a, b) => {
for (let i = 0; i < length; i++) {
const charA = input[(a + i) % length];
const charB = input[(b + i) % length];
if (charA < charB) return -1;
if (charA > charB) return 1;
}
return 0;
});
// To construct the BWT string, we utilize the suffix array.
// The suffix array contains the starting indices of all suffixes of the string, sorted in lexicographical order.
// This sorted order helps us build the BWT string by accessing the characters just before the start of each suffix
// in the cyclic permutation of the string.
const suffixArray = buildSuffixArray(input);

const bwtResult: string[] = [];
let originalIndex = -1;

// For each suffix indexed in the suffix array, we find the character that precedes it in this cyclic view.
// The formula (index + length - 1) % length computes the position of this preceding character:
// - `index` is the start of the current suffix in the sorted suffix array.
// - Add length - 1 to the index to move to the preceding position in the cyclic permutation.
// - Use modulo length to wrap around to the start of the string if needed.
// Example: For a suffix starting at index 5 in a string of length 6:
// - Calculate the preceding character position: (5 + 6 - 1) % 6 = 10 % 6 = 4
// - The character at index 4 in the string is the one just before the suffix starting at index 5.
// Collecting these preceding characters for all suffixes in the sorted order gives us the BWT string.
for (let i = 0; i < length; i++) {
const index = permutationsIndexes[i];
const index = suffixArray[i];
bwtResult.push(input[(index + length - 1) % length]);
if (index === 0) {
originalIndex = i;
Expand Down
122 changes: 122 additions & 0 deletions src/algorithms/bwt/utils/suffix-array.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
/**
 * Builds the suffix array of `input` using prefix doubling with radix sort.
 *
 * Positions are compared cyclically (every look-ahead index wraps with `% length`),
 * so the result is the sorted order of the cyclic rotations of `input`. That order
 * coincides with the order of the suffixes when `input` ends with a unique
 * terminator that sorts before every other character.
 *
 * The loop performs at most ceil(log2(length)) doubling rounds and stops early
 * as soon as every position has a distinct rank.
 *
 * @param input - The string whose suffixes are to be sorted.
 * @returns The starting indexes of the suffixes in sorted order. An empty input
 * yields `[]`, a single-character input yields `[0]`.
 */
export function buildSuffixArray(input: string): number[] {
  const length = input.length;

  // `ranks[i]` is the rank of the substring of `comparisonRange` characters that
  // starts at index `i`. Equal substrings share the same rank.
  //
  // Example for the string "banana":
  //   Indexes:        0   1    2   3    4   5
  //   Initial ranks: [98, 97, 110, 97, 110, 97]   (UTF-16 char codes)
  //
  // After the first round (`comparisonRange = 1`) each rank describes the first
  // two characters read cyclically ("ba", "an", "na", "an", "na", "ab"):
  //   ranks: [3, 2, 4, 2, 4, 1]
  let ranks: number[] = new Array<number>(length);
  // Scratch array that receives the recalculated ranks of the current round.
  // It is swapped with `ranks` at the end of each round.
  let nextRanks: number[] = new Array<number>(length);
  // Suffix start indexes; sorted in place, round after round.
  const sortedSuffixes: number[] = new Array<number>(length);

  // Initial state: rank by the first character only, suffixes in index order.
  for (let i = 0; i < length; i++) {
    ranks[i] = input.charCodeAt(i);
    sortedSuffixes[i] = i;
  }

  // `comparisonRange` is the number of leading characters already captured by
  // `ranks`. It doubles every round: 1, 2, 4, ...
  let comparisonRange = 1;

  while (comparisonRange < length) {
    // Step 1 (secondary key): order by the rank of the position
    // `comparisonRange` characters ahead, wrapping around the end.
    // Example with `comparisonRange = 1` and length 6:
    //   index 0 -> ranks[(0 + 1) % 6] = ranks[1]
    //   index 5 -> ranks[(5 + 1) % 6] = ranks[0]
    radixSort(sortedSuffixes, (i) => ranks[(i + comparisonRange) % length]);
    // Step 2 (primary key): order by the rank of the position itself. The radix
    // sort is stable, so ties keep the order established in step 1.
    radixSort(sortedSuffixes, (i) => ranks[i]);
    // Step 3: derive new ranks that describe 2 * comparisonRange characters.
    recalculateRanks(ranks, nextRanks, sortedSuffixes, comparisonRange, length);

    [ranks, nextRanks] = [nextRanks, ranks];

    // New ranks start at 1 and grow by one for each distinct key, so the last
    // suffix holds the highest rank. If it equals `length`, all ranks are
    // distinct, the order is final, and further rounds cannot change it.
    if (ranks[sortedSuffixes[length - 1]] === length) {
      break;
    }

    comparisonRange *= 2;
  }

  return sortedSuffixes;
}

/**
 * Stably sorts `sortedSuffixes` in place, in ascending order of the rank that
 * `getRank` returns for each element, using a least-significant-digit radix
 * sort that consumes the rank 4 bits at a time.
 *
 * Ranks are expected to be non-negative integers that fit in 31 bits, because
 * digits are extracted with the 32-bit `>>` operator.
 *
 * @param sortedSuffixes - The suffix indexes to sort. The array is mutated.
 * @param getRank - Returns the rank (sort key) of a given suffix index.
 * @returns The same `sortedSuffixes` array, now sorted.
 */
const radixSort = (
  sortedSuffixes: number[],
  getRank: (index: number) => number
) => {
  const bitsPerPass = 4;
  const digitMask = (1 << bitsPerPass) - 1; // 0b1111
  const buckets: number[][] = Array.from({ length: digitMask + 1 }, () => []);

  // Find the largest rank with a plain loop. Spreading the whole array into
  // `Math.max(...)` passes one call argument per element and throws a
  // RangeError once the input is large enough.
  let maxValue = 0;
  for (const suffix of sortedSuffixes) {
    const rank = getRank(suffix);
    if (rank > maxValue) {
      maxValue = rank;
    }
  }
  // Number of bits needed to represent the largest rank; 0 means all ranks are
  // 0 (or the input is empty) and no pass is required.
  const bitsCount = Math.ceil(Math.log2(maxValue + 1));

  // One pass per 4-bit digit, from the least significant digit upwards.
  for (let bitPointer = 0; bitPointer < bitsCount; bitPointer += bitsPerPass) {
    // Distribute: elements are appended in their current order, which keeps
    // every pass stable.
    for (const suffix of sortedSuffixes) {
      const bucketIndex = (getRank(suffix) >> bitPointer) & digitMask;
      buckets[bucketIndex].push(suffix);
    }

    // Collect: write the buckets back in digit order. Elements are copied one
    // by one instead of `push(...bucket)` to avoid the argument-count limit.
    let writeIndex = 0;
    for (const bucket of buckets) {
      for (const suffix of bucket) {
        sortedSuffixes[writeIndex++] = suffix;
      }
      bucket.length = 0;
    }
  }

  return sortedSuffixes;
};

/**
 * Assigns new ranks to the suffixes after a sorting round.
 *
 * Walks `suffixArray` in its sorted order and compares each suffix with its
 * predecessor on the key pair (rank at the suffix start, rank `comparisonSpan`
 * positions ahead, wrapping with `% length`). Suffixes with an equal key pair
 * receive the same rank; otherwise the rank is incremented. Ranks start at 1.
 *
 * @param currentRanks - Ranks from the previous round, indexed by suffix start. Read only.
 * @param nextRanks - Output array; `nextRanks[s]` is set for every suffix `s` visited.
 * @param suffixArray - Suffix start indexes, already sorted by the key pair.
 * @param comparisonSpan - Distance to the position that supplies the second key.
 * @param length - Number of suffixes to process; also the modulus for wrapping.
 */
function recalculateRanks(
  currentRanks: number[],
  nextRanks: number[],
  suffixArray: number[],
  comparisonSpan: number,
  length: number
): void {
  // Nothing to rank. Without this guard `suffixArray[0]` is `undefined` and the
  // assignment below would add a stray "undefined" property to `nextRanks`.
  if (suffixArray.length === 0) {
    return;
  }

  let rank = 1;
  nextRanks[suffixArray[0]] = rank;

  for (let i = 1; i < length; i++) {
    const currentSuffix = suffixArray[i];
    const previousSuffix = suffixArray[i - 1];

    // First key: rank of the suffix start itself.
    const differentRanks =
      currentRanks[currentSuffix] !== currentRanks[previousSuffix];
    // Second key: rank of the position `comparisonSpan` ahead (cyclic).
    const differentNextRanks =
      currentRanks[(currentSuffix + comparisonSpan) % length] !==
      currentRanks[(previousSuffix + comparisonSpan) % length];

    // A change in either key starts a new rank group.
    if (differentRanks || differentNextRanks) {
      rank++;
    }
    nextRanks[currentSuffix] = rank;
  }
}

0 comments on commit 5707da1

Please sign in to comment.