generated from obsidianmd/obsidian-sample-plugin
-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
✨ Add support for languages whose word break is whitespace
- Loading branch information
1 parent
01da01e
commit bc63ac2
Showing
4 changed files
with
138 additions
and
36 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
import TinySegmenter from "./tiny-segmenter"; | ||
import CodeMirror from "codemirror"; | ||
// @ts-ignore | ||
const segmenter = new TinySegmenter(); | ||
|
||
/**
 * Names of the tokenization strategies accepted by `createTokenizer`.
 */
export type TokenizeStrategy =
  | "default"
  | "japanese";
|
||
/**
 * Collects candidate words from every line of the editor.
 *
 * Tokens whose type includes "hmd-codeblock" are split on single spaces;
 * every other token is kept whole. The characters matched by the regex
 * below (brackets, quotes, some punctuation, space and backtick) are then
 * removed from each piece, and pieces that end up empty are dropped.
 *
 * @param cmEditor - Editor whose document is scanned line by line.
 * @returns The cleaned, non-empty words in document order.
 */
function pickTokens(cmEditor: CodeMirror.Editor): string[] {
  const lineCount = cmEditor.getDoc().lineCount();
  const words: string[] = [];

  for (let lineNo = 0; lineNo < lineCount; lineNo++) {
    for (const token of cmEditor.getLineTokens(lineNo)) {
      // Only code block tokens are broken up further on spaces.
      const pieces = token.type?.includes("hmd-codeblock")
        ? token.string.split(" ")
        : [token.string];

      for (const piece of pieces) {
        const cleaned = piece.replace(/[\[\]()<>"'.,|; `]/g, "");
        if (cleaned !== "") {
          words.push(cleaned);
        }
      }
    }
  }

  return words;
}
|
||
/**
 * Collects candidate words from the whole editor content by running the
 * module-level segmenter over each line.
 *
 * The same punctuation characters that `pickTokens` strips are removed from
 * every segment. The regex carries the `g` flag so that all occurrences in a
 * segment are removed (not just the first one), and segments that end up
 * empty are dropped so callers never receive "" as a word.
 *
 * @param cmEditor - Editor whose full text is segmented.
 * @returns The cleaned, non-empty segments in document order.
 */
function pickTokensAsJapanese(cmEditor: CodeMirror.Editor): string[] {
  return cmEditor
    .getValue()
    .split(`\n`)
    .flatMap<string>((line) => segmenter.segment(line))
    .map((word) => word.replace(/[\[\]()<>"'.,|; `]/g, ""))
    .filter((word) => word !== "");
}
|
||
/**
 * Outcome of one `Tokenizer.tokenize()` call.
 */
interface TokenizedResult {
  /** Words collected from the editor content. */
  tokens: string[];
  /** The token (or last segment of it) found at the cursor. */
  currentToken: string;
  /** Start offset of `currentToken`, derived from the editor token's `start`. */
  currentTokenStart: number;
}
|
||
/**
 * Contract shared by the tokenization strategies in this module.
 */
interface Tokenizer {
  /**
   * Tokenizes around the current cursor position.
   *
   * @returns The tokenized result, or `undefined` when the current token is
   * empty.
   */
  tokenize(): TokenizedResult | undefined;
}
|
||
/**
 * Tokenizer that relies on the editor's own tokens: the current token is
 * whatever `getTokenAt` reports at the cursor, and the word list comes from
 * `pickTokens`.
 */
class DefaultTokenizer implements Tokenizer {
  private readonly cmEditor: CodeMirror.Editor;

  /**
   * @param cmEditor - Editor to read the cursor, tokens and document from.
   */
  constructor(cmEditor: CodeMirror.Editor) {
    this.cmEditor = cmEditor;
  }

  /**
   * Reads the token at the cursor and collects the document's words.
   *
   * @returns `undefined` when the token at the cursor has an empty string;
   * otherwise the token text, its start offset and the collected words.
   */
  tokenize(): TokenizedResult | undefined {
    const cursor = this.cmEditor.getCursor();
    const token = this.cmEditor.getTokenAt(cursor);
    if (!token.string) {
      return undefined;
    }

    // The document is scanned exactly once per call; the former debug
    // console.log re-ran the full scan and flooded the console.
    return {
      currentToken: token.string,
      currentTokenStart: token.start,
      tokens: pickTokens(this.cmEditor),
    };
  }
}
|
||
/**
 * Tokenizer that further splits the editor token at the cursor with the
 * module-level segmenter and treats the last segment as the current token.
 * The word list comes from `pickTokensAsJapanese`.
 */
class JapaneseTokenizer implements Tokenizer {
  private readonly cmEditor: CodeMirror.Editor;

  /**
   * @param cmEditor - Editor to read the cursor, tokens and document from.
   */
  constructor(cmEditor: CodeMirror.Editor) {
    this.cmEditor = cmEditor;
  }

  /**
   * Segments the token at the cursor and collects the document's words.
   *
   * @returns `undefined` when the token at the cursor is empty or yields no
   * usable last segment; otherwise the last segment, its start offset and
   * the collected words.
   */
  tokenize(): TokenizedResult | undefined {
    const cursor = this.cmEditor.getCursor();
    const token = this.cmEditor.getTokenAt(cursor);
    if (!token.string) {
      return undefined;
    }

    const words: string[] = segmenter.segment(token.string);
    // pop() removes the last segment, leaving only the preceding ones in
    // `words`; it yields undefined if the segmenter returned nothing.
    const currentToken = words.pop();
    if (!currentToken) {
      // Honor the Tokenizer contract instead of leaking an undefined or
      // empty `currentToken` into the result.
      return undefined;
    }

    // The current segment starts after all preceding segments of the token.
    const currentTokenStart =
      token.start + words.reduce((t: number, x: string) => t + x.length, 0);
    const tokens = pickTokensAsJapanese(this.cmEditor);

    return {
      currentToken,
      currentTokenStart,
      tokens,
    };
  }
}
|
||
/**
 * Builds the tokenizer that corresponds to the given strategy name.
 *
 * @param cmEditor - Editor handed to the created tokenizer.
 * @param strategy - Which tokenizer implementation to create.
 * @returns A `DefaultTokenizer` for "default", a `JapaneseTokenizer` for
 * "japanese".
 * @throws Error when `strategy` is neither of the known names.
 */
export function createTokenizer(
  cmEditor: CodeMirror.Editor,
  strategy: TokenizeStrategy
): Tokenizer {
  if (strategy === "default") {
    return new DefaultTokenizer(cmEditor);
  }
  if (strategy === "japanese") {
    return new JapaneseTokenizer(cmEditor);
  }
  // Reached only when a value outside the declared union slips through.
  throw new Error(`Unexpected strategy name: ${strategy}`);
}