diff --git a/.github/workflows/build-hf-space.yml b/.github/workflows/build-hf-space.yml
index 8206c3d..51dc2a6 100644
--- a/.github/workflows/build-hf-space.yml
+++ b/.github/workflows/build-hf-space.yml
@@ -25,7 +25,7 @@ jobs:
       - name: Setup Node.js
         uses: actions/setup-node@v4
         with:
-          node-version: '18'
+          node-version: '22'

       - name: Build Hugging Face space
         shell: bash
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..2450a57
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,80 @@
+name: CI
+
+on:
+  push:
+  workflow_dispatch:
+
+concurrency:
+  group: ci-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+env:
+  PLAYWRIGHT_BROWSERS_PATH: ${{ github.workspace }}/.cache/ms-playwright
+
+jobs:
+  test:
+    runs-on: ${{ matrix.os }}
+
+    timeout-minutes: 10
+
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        node_version: [22]
+        # include:
+        #   - os: macos-14
+        #     node_version: 22
+        #   - os: windows-latest
+        #     node_version: 22
+      fail-fast: false
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set node version to ${{ matrix.node_version }}
+        uses: actions/setup-node@v4
+        with:
+          node-version: ${{ matrix.node_version }}
+
+      - name: Install
+        run: npm ci --include=dev
+
+      - name: Install Playwright Dependencies
+        run: npx playwright install --with-deps
+
+      - name: Build
+        run: npm run build
+
+      - name: Test (Chrome)
+        run: npm run test
+
+      - name: Test (Firefox)
+        run: npm run test:firefox
+
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup NodeJS
+        uses: actions/setup-node@v4
+        with:
+          node-version: '22'
+
+      - name: Install
+        run: npm ci --include=dev
+
+      - name: Check format
+        run: |
+          git config --global --add safe.directory $(realpath .)
+          git status
+          npm run format
+          git status
+          modified_files="$(git status -s)"
+          echo "Modified files: ${modified_files}"
+          if [ -n "${modified_files}" ]; then
+            echo "Detected unformatted files"
+            echo "You may need to run: npm run format"
+            echo "${modified_files}"
+            exit 1
+          fi
\ No newline at end of file
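The "Check format" step above fails the build whenever running the formatter leaves the working tree dirty. For running the same check locally, a minimal TypeScript sketch of that logic (a hypothetical helper script, not part of this PR) could look like:

```ts
// check-format.ts — hypothetical local equivalent of the CI "Check format" step.
// Runs the formatter, then fails if git reports any modified files.
import { execSync } from 'node:child_process';

execSync('npm run format', { stdio: 'inherit' });
const modified = execSync('git status -s', { encoding: 'utf8' }).trim();
if (modified.length > 0) {
  console.error('Detected unformatted files; you may need to run: npm run format');
  console.error(modified);
  process.exit(1);
}
```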
diff --git a/.github/workflows/generate-docs.yml b/.github/workflows/generate-docs.yml
index 8d366ea..a333b0e 100644
--- a/.github/workflows/generate-docs.yml
+++ b/.github/workflows/generate-docs.yml
@@ -1,9 +1,8 @@
-name: Deploy docs and demo to GitHub Pages
+name: Build docs and demo

 on:
-  # Runs on pushes targeting the default branch
+  # Runs on pushes
   push:
-    branches: ["master"]

   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
@@ -31,7 +30,7 @@ jobs:
       - name: Setup Node.js
         uses: actions/setup-node@v4
         with:
-          node-version: '18'
+          node-version: '22'

       - name: Install Dependencies
         run: npm ci
@@ -53,10 +52,12 @@
         rm -rf node_modules

       - name: Upload artifact
+        if: github.ref == 'refs/heads/master'
        uses: actions/upload-pages-artifact@v3
         with:
           path: "./"

       - name: Deploy to GitHub Pages
         id: deployment
+        if: github.ref == 'refs/heads/master'
         uses: actions/deploy-pages@v4
diff --git a/.github/workflows/verify-generated-code.yml b/.github/workflows/verify-generated-code.yml
new file mode 100644
index 0000000..9e01118
--- /dev/null
+++ b/.github/workflows/verify-generated-code.yml
@@ -0,0 +1,34 @@
+name: Verify generated worker code is up-to-date
+
+on:
+  # Runs on pushes
+  push:
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+jobs:
+  verify:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: '22'
+
+      - name: Verify generated code
+        run: |
+          git config --global --add safe.directory $(realpath .)
+          git status
+          npm run build:worker
+          git status
+          modified_files="$(git status -s)"
+          echo "Modified files: ${modified_files}"
+          if [ -n "${modified_files}" ]; then
+            echo "Generated code file is not up-to-date"
+            echo "Hint: You may need to run: npm run build:worker"
+            echo "${modified_files}"
+            exit 1
+          fi
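The check above assumes `npm run build:worker` regenerates a committed file (the `.prettierignore` change below points at `src/workers-code/generated.ts`). A hypothetical sketch of what such a generator might do — file names and layout are assumptions, not taken from this PR:

```ts
// build-worker.ts — hypothetical generator: inlines worker source into a committed
// TypeScript module so the library can spawn workers without shipping extra files.
import { readFileSync, writeFileSync } from 'node:fs';

const workerSource = readFileSync('src/workers-code/worker.js', 'utf8');
const generated =
  '// AUTOGENERATED by build:worker — do not edit by hand\n' +
  `export const WORKER_CODE = ${JSON.stringify(workerSource)};\n`;
writeFileSync('src/workers-code/generated.ts', generated);
```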
diff --git a/.gitignore b/.gitignore
index f404f0f..b85e4c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 node_modules
 .DS_Store
+.vscode
 /cache
 /docs
 /dist
diff --git a/.prettierignore b/.prettierignore
index bdff196..69c7abe 100644
--- a/.prettierignore
+++ b/.prettierignore
@@ -20,6 +20,8 @@
 /src/multi-thread
 /src/single-thread
+/src/workers-code/generated.ts
+/src/wasm-from-cdn.ts

 *.md
 *.mdx
diff --git a/.vscode/settings.json b/.vscode/settings.json
deleted file mode 100644
index b3e9cd1..0000000
--- a/.vscode/settings.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-  "files.associations": {
-    "typeinfo": "cpp"
-  }
-}
\ No newline at end of file
diff --git a/README.md b/README.md
index 1b46b8c..726b26e 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,9 @@ WebAssembly binding for [llama.cpp](https://github.com/ggerganov/llama.cpp)

 For changelog, please visit [releases page](https://github.com/ngxson/wllama/releases)

+> [!IMPORTANT]
+> Version 2.0 is released 👉 [read more](./guides/intro-v2.md)
+
 ![](./assets/screenshot_0.png)

 ## Features
@@ -35,8 +38,8 @@ Limitations:

 Demo:
 - Basic usages with completions and embeddings: https://github.ngxson.com/wllama/examples/basic/
-- Advanced example using low-level API: https://github.ngxson.com/wllama/examples/advanced/
 - Embedding and cosine distance: https://github.ngxson.com/wllama/examples/embeddings/
+- For a more advanced example using the low-level API, have a look at the test file: [wllama.test.ts](./src/wllama.test.ts)

 ## How to use

@@ -48,7 +51,15 @@ Install it:
 ```
 npm i @wllama/wllama
 ```

-For complete code, see [examples/reactjs](./examples/reactjs)
+Then, import the module:
+
+```ts
+import { Wllama } from '@wllama/wllama';
+let wllamaInstance = new Wllama(WLLAMA_CONFIG_PATHS, ...);
+// (the rest is the same as the earlier example)
+```
+
+For a complete code example, see [examples/main/utils/wllama.context.tsx](./examples/main/utils/wllama.context.tsx)

 NOTE: this example only covers completions usage. For embeddings, please see [examples/embeddings/index.html](./examples/embeddings/index.html)

@@ -67,11 +78,8 @@ import { Wllama } from './esm/index.js';

 (async () => {
   const CONFIG_PATHS = {
-    'single-thread/wllama.js'       : './esm/single-thread/wllama.js',
-    'single-thread/wllama.wasm'     : './esm/single-thread/wllama.wasm',
-    'multi-thread/wllama.js'        : './esm/multi-thread/wllama.js',
-    'multi-thread/wllama.wasm'      : './esm/multi-thread/wllama.wasm',
-    'multi-thread/wllama.worker.mjs': './esm/multi-thread/wllama.worker.mjs',
+    'single-thread/wllama.wasm': './esm/single-thread/wllama.wasm',
+    'multi-thread/wllama.wasm' : './esm/multi-thread/wllama.wasm',
   };
   // Automatically switch between single-thread and multi-thread version based on browser support
   // If you want to enforce single-thread, add { "n_threads": 1 } to LoadModelConfig
@@ -83,8 +91,11 @@
     // Log the progress in a user-friendly format
     console.log(`Downloading... ${progressPercentage}%`);
   };
-  await wllama.loadModelFromUrl(
-    "https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf",
+  // Load GGUF from Hugging Face hub
+  // (alternatively, you can use loadModelFromUrl if the model is not from HF hub)
+  await wllama.loadModelFromHF(
+    'ggml-org/models',
+    'tinyllamas/stories260K.gguf',
     {
       progressCallback,
     }
@@ -101,6 +112,14 @@
 })();
 ```

+Alternatively, you can use the `*.wasm` files from CDN:
+
+```js
+import WasmFromCDN from '@wllama/wllama/esm/wasm-from-cdn.js';
+const wllama = new Wllama(WasmFromCDN);
+// NOTE: this is not recommended; only use it when you can't embed the wasm files in your project
+```
+
 ### Split model

 Cases where we want to split the model:
@@ -116,14 +135,15 @@
 We use `llama-gguf-split` to split a big gguf file into smaller files. You can download the pre-built binary from the llama.cpp release page.

 This will output files ending with `-00001-of-00003.gguf`, `-00002-of-00003.gguf`, and so on.

-You can then pass to `loadModelFromUrl` the URL of the first file and it will automatically load all the chunks:
+You can then pass the URL of the first file to `loadModelFromUrl` or `loadModelFromHF` and it will automatically load all the chunks:

 ```js
-await wllama.loadModelFromUrl(
-  'https://huggingface.co/ngxson/tinyllama_split_test/resolve/main/stories15M-q8_0-00001-of-00003.gguf',
-  {
-    parallelDownloads: 5, // optional: maximum files to download in parallel (default: 3)
-  },
-);
+const wllama = new Wllama(CONFIG_PATHS, {
+  parallelDownloads: 5, // optional: maximum files to download in parallel (default: 3)
+});
+await wllama.loadModelFromHF(
+  'ngxson/tinyllama_split_test',
+  'stories15M-q8_0-00001-of-00003.gguf'
+);
 ```

@@ -184,11 +204,7 @@
 npm run build
 ```

 ## TODO

-Short term:
-- Add a more pratical embedding example (using a better model)
-- Maybe doing a full RAG-in-browser example using tinyllama?
-
-Long term:
+- Add support for LoRA adapter
 - Support GPU inference via WebGL
 - Support multi-sequences: knowing the resource limitation when using WASM, I don't think having multi-sequences is a good idea
 - Multi-modal: Waiting for refactoring LLaVA implementation from llama.cpp
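Putting the README changes together, a minimal end-to-end sketch of the v2 flow might look like the following. `Wllama`, `loadModelFromHF`, the wasm config paths, and the model repo/file are taken from the diff above; `createCompletion` and its `nPredict` option are assumed from wllama's existing completion API rather than from this PR:

```ts
import { Wllama } from '@wllama/wllama';

// Only the two .wasm paths are needed in v2 (see CONFIG_PATHS change above)
const CONFIG_PATHS = {
  'single-thread/wllama.wasm': './esm/single-thread/wllama.wasm',
  'multi-thread/wllama.wasm': './esm/multi-thread/wllama.wasm',
};

(async () => {
  const wllama = new Wllama(CONFIG_PATHS);
  await wllama.loadModelFromHF('ggml-org/models', 'tinyllamas/stories260K.gguf', {
    progressCallback: ({ loaded, total }) =>
      console.log(`Downloading... ${Math.round((loaded / total) * 100)}%`),
  });
  const output = await wllama.createCompletion('Once upon a time,', {
    nPredict: 50, // assumed option name, following earlier wllama examples
  });
  console.log(output);
})();
```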
diff --git a/actions.hpp b/actions.hpp
index f00274c..fd2a537 100644
--- a/actions.hpp
+++ b/actions.hpp
@@ -287,7 +287,7 @@ json action_set_options(app_t &app, json &body)
 json action_sampling_init(app_t &app, json &body)
 {
   // sampling
-  common_sampler_params sparams;
+  common_params_sampling sparams;
   sparams.seed = app.seed;
   if (sparams.seed == LLAMA_DEFAULT_SEED)
     sparams.seed = time(NULL);
diff --git a/examples/advanced/index.html b/examples/advanced/index.html
deleted file mode 100644
index 66e0ca9..0000000
--- a/examples/advanced/index.html
+++ /dev/null
@@ -1,197 +0,0 @@
[197 deleted lines: the "wllama.cpp demo" advanced example page, removed in favor of wllama.test.ts]
\ No newline at end of file
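With the advanced example page deleted, low-level usage now lives in wllama.test.ts. For orientation only, here is a hedged sketch of the kind of low-level call that file covers; `tokenize` is part of wllama's public API, but treat the exact signature and return shape as assumptions and defer to the test file:

```ts
import { Wllama } from '@wllama/wllama';

const CONFIG_PATHS = {
  'single-thread/wllama.wasm': './esm/single-thread/wllama.wasm',
  'multi-thread/wllama.wasm': './esm/multi-thread/wllama.wasm',
};

(async () => {
  const wllama = new Wllama(CONFIG_PATHS);
  await wllama.loadModelFromHF('ggml-org/models', 'tinyllamas/stories260K.gguf');
  // Low-level API: inspect tokenizer output directly instead of using createCompletion
  const tokens = await wllama.tokenize('Hello world');
  console.log(tokens); // expected: an array of token IDs
})();
```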
diff --git a/examples/basic/index.html b/examples/basic/index.html
index d6e900d..755de09 100644
--- a/examples/basic/index.html
+++ b/examples/basic/index.html
@@ -50,11 +50,8 @@

   Embeddings

     import { Wllama } from '../../esm/index.js';
     const CONFIG_PATHS = {
-      'single-thread/wllama.js'       : '../../esm/single-thread/wllama.js',
-      'single-thread/wllama.wasm'     : '../../esm/single-thread/wllama.wasm',
-      'multi-thread/wllama.js'        : '../../esm/multi-thread/wllama.js',
-      'multi-thread/wllama.wasm'      : '../../esm/multi-thread/wllama.wasm',
-      'multi-thread/wllama.worker.mjs': '../../esm/multi-thread/wllama.worker.mjs',
+      'single-thread/wllama.wasm': '../../esm/single-thread/wllama.wasm',
+      'multi-thread/wllama.wasm' : '../../esm/multi-thread/wllama.wasm',
     };
     const CMPL_MODEL = 'https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories15M-q4_0.gguf';
     const CMPL_MODEL_SIZE = '19MB';
diff --git a/examples/embeddings/index.html b/examples/embeddings/index.html
index a349523..6b0b9f1 100644
--- a/examples/embeddings/index.html
+++ b/examples/embeddings/index.html
@@ -22,11 +22,8 @@
     import { Wllama } from '../../esm/index.js';
     const CONFIG_PATHS = {
-      'single-thread/wllama.js'       : '../../esm/single-thread/wllama.js',
-      'single-thread/wllama.wasm'     : '../../esm/single-thread/wllama.wasm',
-      'multi-thread/wllama.js'        : '../../esm/multi-thread/wllama.js',
-      'multi-thread/wllama.wasm'      : '../../esm/multi-thread/wllama.wasm',
-      'multi-thread/wllama.worker.mjs': '../../esm/multi-thread/wllama.worker.mjs',
+      'single-thread/wllama.wasm': '../../esm/single-thread/wllama.wasm',
+      'multi-thread/wllama.wasm' : '../../esm/multi-thread/wllama.wasm',
     };
     const MODEL = 'https://huggingface.co/CompendiumLabs/bge-base-en-v1.5-gguf/resolve/main/bge-base-en-v1.5-q4_k_m.gguf';
     const TEXTS = [
diff --git a/examples/main/src/components/ChatScreen.tsx b/examples/main/src/components/ChatScreen.tsx
index 27d7a31..ebe208f 100644
--- a/examples/main/src/components/ChatScreen.tsx
+++ b/examples/main/src/components/ChatScreen.tsx
@@ -16,7 +16,7 @@ export default function ChatScreen() {
     isGenerating,
     createCompletion,
     navigateTo,
-    currModel,
+    loadedModel,
     getWllamaInstance,
     stopCompletion,
   } = useWllama();
@@ -64,8 +64,8 @@ export default function ChatScreen() {
     }

     // generate response
-    if (!currModel) {
-      throw new Error('currModel is null');
+    if (!loadedModel) {
+      throw new Error('loadedModel is null');
     }
     const formattedChat = await formatChat(getWllamaInstance(), [
       ...currHistory,
@@ -118,7 +118,7 @@
         )}

-        {currModel && (
+        {loadedModel && (
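The `currModel` → `loadedModel` rename above gives the context field a clearer name. A hypothetical consumer of the renamed hook — component name, import path, and hook shape are inferred from the destructuring in ChatScreen.tsx, not part of this PR:

```tsx
// ModelStatus.tsx — hypothetical component; useWllama's shape is inferred from
// the destructuring in ChatScreen.tsx above.
import { useWllama } from '../utils/wllama.context';

export default function ModelStatus() {
  const { loadedModel } = useWllama();
  // Render a simple indicator instead of throwing when no model is loaded
  return <span>{loadedModel ? 'Model ready' : 'No model loaded'}</span>;
}
```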