diff --git a/.github/workflows/build-hf-space.yml b/.github/workflows/build-hf-space.yml
index 8206c3d..51dc2a6 100644
--- a/.github/workflows/build-hf-space.yml
+++ b/.github/workflows/build-hf-space.yml
@@ -25,7 +25,7 @@ jobs:
- name: Setup Node.js
uses: actions/setup-node@v4
with:
- node-version: '18'
+ node-version: '22'
- name: Build Hugging Face space
shell: bash
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..2450a57
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,80 @@
+name: CI
+
+on:
+ push:
+ workflow_dispatch:
+
+concurrency:
+ group: ci-${{ github.event.pull_request.number || github.ref }}
+ cancel-in-progress: true
+
+env:
+ PLAYWRIGHT_BROWSERS_PATH: ${{ github.workspace }}/.cache/ms-playwright
+
+jobs:
+ test:
+ runs-on: ${{ matrix.os }}
+
+ timeout-minutes: 10
+
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ node_version: [22]
+ # include:
+ # - os: macos-14
+ # node_version: 22
+ # - os: windows-latest
+ # node_version: 22
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set node version to ${{ matrix.node_version }}
+ uses: actions/setup-node@v4
+ with:
+ node-version: ${{ matrix.node_version }}
+
+ - name: Install
+ run: npm ci --include=dev
+
+ - name: Install Playwright Dependencies
+ run: npx playwright install --with-deps
+
+ - name: Build
+ run: npm run build
+
+ - name: Test (Chrome)
+ run: npm run test
+
+ - name: Test (Firefox)
+ run: npm run test:firefox
+
+ lint:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Setup NodeJS
+ uses: actions/setup-node@v4
+ with:
+ node-version: '22'
+
+ - name: Install
+ run: npm ci --include=dev
+
+ - name: Check format
+ run: |
+ git config --global --add safe.directory $(realpath .)
+ git status
+ npm run format
+ git status
+ modified_files="$(git status -s)"
+ echo "Modified files: ${modified_files}"
+ if [ -n "${modified_files}" ]; then
+ echo "Detect unformatted files"
+ echo "You may need to run: npm run format"
+ echo "${modified_files}"
+ exit 1
+ fi
\ No newline at end of file
diff --git a/.github/workflows/generate-docs.yml b/.github/workflows/generate-docs.yml
index 8d366ea..a333b0e 100644
--- a/.github/workflows/generate-docs.yml
+++ b/.github/workflows/generate-docs.yml
@@ -1,9 +1,8 @@
-name: Deploy docs and demo to GitHub Pages
+name: Build docs and demo
on:
- # Runs on pushes targeting the default branch
+ # Runs on pushes
push:
- branches: ["master"]
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:
@@ -31,7 +30,7 @@ jobs:
- name: Setup Node.js
uses: actions/setup-node@v4
with:
- node-version: '18'
+ node-version: '22'
- name: Install Dependencies
run: npm ci
@@ -53,10 +52,12 @@ jobs:
rm -rf node_modules
- name: Upload artifact
+ if: github.ref == 'refs/heads/master'
uses: actions/upload-pages-artifact@v3
with:
path: "./"
- name: Deploy to GitHub Pages
id: deployment
+ if: github.ref == 'refs/heads/master'
uses: actions/deploy-pages@v4
diff --git a/.github/workflows/verify-generated-code.yml b/.github/workflows/verify-generated-code.yml
new file mode 100644
index 0000000..9e01118
--- /dev/null
+++ b/.github/workflows/verify-generated-code.yml
@@ -0,0 +1,34 @@
+name: Verify generated worker code is up-to-date
+
+on:
+ # Runs on pushes
+ push:
+ # Allows you to run this workflow manually from the Actions tab
+ workflow_dispatch:
+
+jobs:
+ verify:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+
+ - name: Setup Node.js
+ uses: actions/setup-node@v4
+ with:
+ node-version: '22'
+
+ - name: Verify generated code
+ run: |
+ git config --global --add safe.directory $(realpath .)
+ git status
+ npm run build:worker
+ git status
+ modified_files="$(git status -s)"
+ echo "Modified files: ${modified_files}"
+ if [ -n "${modified_files}" ]; then
+ echo "Generated code file is not up-to-date"
+ echo "Hint: You may need to run: npm run build:worker"
+ echo "${modified_files}"
+ exit 1
+ fi
diff --git a/.gitignore b/.gitignore
index f404f0f..b85e4c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
node_modules
.DS_Store
+.vscode
/cache
/docs
/dist
diff --git a/.prettierignore b/.prettierignore
index bdff196..69c7abe 100644
--- a/.prettierignore
+++ b/.prettierignore
@@ -20,6 +20,8 @@
/src/multi-thread
/src/single-thread
+/src/workers-code/generated.ts
+/src/wasm-from-cdn.ts
*.md
*.mdx
diff --git a/.vscode/settings.json b/.vscode/settings.json
deleted file mode 100644
index b3e9cd1..0000000
--- a/.vscode/settings.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
- "files.associations": {
- "typeinfo": "cpp"
- }
-}
\ No newline at end of file
diff --git a/README.md b/README.md
index 1b46b8c..726b26e 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,9 @@ WebAssembly binding for [llama.cpp](https://github.com/ggerganov/llama.cpp)
For changelog, please visit [releases page](https://github.com/ngxson/wllama/releases)
+> [!IMPORTANT]
+> Version 2.0 is released 👉 [read more](./guides/intro-v2.md)
+
![](./assets/screenshot_0.png)
## Features
@@ -35,8 +38,8 @@ Limitations:
Demo:
- Basic usages with completions and embeddings: https://github.ngxson.com/wllama/examples/basic/
-- Advanced example using low-level API: https://github.ngxson.com/wllama/examples/advanced/
- Embedding and cosine distance: https://github.ngxson.com/wllama/examples/embeddings/
+- For a more advanced example using the low-level API, have a look at the test file: [wllama.test.ts](./src/wllama.test.ts)
## How to use
@@ -48,7 +51,15 @@ Install it:
npm i @wllama/wllama
```
-For complete code, see [examples/reactjs](./examples/reactjs)
+Then, import the module:
+
+```ts
+import { Wllama } from '@wllama/wllama';
+let wllamaInstance = new Wllama(WLLAMA_CONFIG_PATHS, ...);
+// (the rest is the same as in the earlier example)
+```
+
+For a complete code example, see [examples/main/utils/wllama.context.tsx](./examples/main/utils/wllama.context.tsx)
NOTE: this example only covers completions usage. For embeddings, please see [examples/embeddings/index.html](./examples/embeddings/index.html)
@@ -67,11 +78,8 @@ import { Wllama } from './esm/index.js';
(async () => {
const CONFIG_PATHS = {
- 'single-thread/wllama.js' : './esm/single-thread/wllama.js',
- 'single-thread/wllama.wasm' : './esm/single-thread/wllama.wasm',
- 'multi-thread/wllama.js' : './esm/multi-thread/wllama.js',
- 'multi-thread/wllama.wasm' : './esm/multi-thread/wllama.wasm',
- 'multi-thread/wllama.worker.mjs': './esm/multi-thread/wllama.worker.mjs',
+ 'single-thread/wllama.wasm': './esm/single-thread/wllama.wasm',
+ 'multi-thread/wllama.wasm' : './esm/multi-thread/wllama.wasm',
};
// Automatically switch between single-thread and multi-thread version based on browser support
// If you want to enforce single-thread, add { "n_threads": 1 } to LoadModelConfig
@@ -83,8 +91,11 @@ import { Wllama } from './esm/index.js';
// Log the progress in a user-friendly format
console.log(`Downloading... ${progressPercentage}%`);
};
- await wllama.loadModelFromUrl(
- "https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf",
+ // Load GGUF from Hugging Face hub
+ // (alternatively, you can use loadModelFromUrl if the model is not from HF hub)
+ await wllama.loadModelFromHF(
+ 'ggml-org/models',
+ 'tinyllamas/stories260K.gguf',
{
progressCallback,
}
@@ -101,6 +112,14 @@ import { Wllama } from './esm/index.js';
})();
```
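+
+For example, to enforce the single-threaded build mentioned in the comment above, you can pass `n_threads: 1` when loading the model. This is a minimal sketch, assuming `LoadModelConfig` fields can be passed in the same options object as `progressCallback` (reusing `wllama` and `progressCallback` from the example above):
+
+```js
+// Sketch: force the single-threaded build by setting n_threads to 1.
+// Assumption: LoadModelConfig options (like n_threads) are accepted
+// alongside download options (like progressCallback).
+await wllama.loadModelFromHF(
+  'ggml-org/models',
+  'tinyllamas/stories260K.gguf',
+  {
+    n_threads: 1,
+    progressCallback,
+  }
+);
+```
+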
+Alternatively, you can use the `*.wasm` files from CDN:
+
+```js
+import WasmFromCDN from '@wllama/wllama/esm/wasm-from-cdn.js';
+const wllama = new Wllama(WasmFromCDN);
+// NOTE: this is not recommended; only use it when you can't embed the wasm files in your project
+```
+
### Split model
Cases where we want to split the model:
@@ -116,14 +135,15 @@ We use `llama-gguf-split` to split a big gguf file into smaller files. You can d
This will output files ending with `-00001-of-00003.gguf`, `-00002-of-00003.gguf`, and so on.
-You can then pass to `loadModelFromUrl` the URL of the first file and it will automatically load all the chunks:
+You can then pass the URL of the first file to `loadModelFromUrl` (or the repo and file name of the first file to `loadModelFromHF`), and it will automatically load all the chunks:
```js
-await wllama.loadModelFromUrl(
- 'https://huggingface.co/ngxson/tinyllama_split_test/resolve/main/stories15M-q8_0-00001-of-00003.gguf',
- {
- parallelDownloads: 5, // optional: maximum files to download in parallel (default: 3)
- },
+const wllama = new Wllama(CONFIG_PATHS, {
+ parallelDownloads: 5, // optional: maximum files to download in parallel (default: 3)
+});
+await wllama.loadModelFromHF(
+ 'ngxson/tinyllama_split_test',
+ 'stories15M-q8_0-00001-of-00003.gguf'
);
```
@@ -184,11 +204,7 @@ npm run build
## TODO
-Short term:
-- Add a more pratical embedding example (using a better model)
-- Maybe doing a full RAG-in-browser example using tinyllama?
-
-Long term:
+- Add support for LoRA adapter
- Support GPU inference via WebGL
- Support multi-sequences: knowing the resource limitation when using WASM, I don't think having multi-sequences is a good idea
- Multi-modal: Waiting for refactoring LLaVA implementation from llama.cpp
diff --git a/actions.hpp b/actions.hpp
index f00274c..fd2a537 100644
--- a/actions.hpp
+++ b/actions.hpp
@@ -287,7 +287,7 @@ json action_set_options(app_t &app, json &body)
json action_sampling_init(app_t &app, json &body)
{
// sampling
- common_sampler_params sparams;
+ common_params_sampling sparams;
sparams.seed = app.seed;
if (sparams.seed == LLAMA_DEFAULT_SEED)
sparams.seed = time(NULL);
diff --git a/examples/advanced/index.html b/examples/advanced/index.html
deleted file mode 100644
index 66e0ca9..0000000
--- a/examples/advanced/index.html
+++ /dev/null
@@ -1,197 +0,0 @@
-
-
-
-
-
- wllama.cpp demo
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/examples/basic/index.html b/examples/basic/index.html
index d6e900d..755de09 100644
--- a/examples/basic/index.html
+++ b/examples/basic/index.html
@@ -50,11 +50,8 @@ Embeddings
import { Wllama } from '../../esm/index.js';
const CONFIG_PATHS = {
- 'single-thread/wllama.js' : '../../esm/single-thread/wllama.js',
- 'single-thread/wllama.wasm' : '../../esm/single-thread/wllama.wasm',
- 'multi-thread/wllama.js' : '../../esm/multi-thread/wllama.js',
- 'multi-thread/wllama.wasm' : '../../esm/multi-thread/wllama.wasm',
- 'multi-thread/wllama.worker.mjs': '../../esm/multi-thread/wllama.worker.mjs',
+ 'single-thread/wllama.wasm': '../../esm/single-thread/wllama.wasm',
+ 'multi-thread/wllama.wasm' : '../../esm/multi-thread/wllama.wasm',
};
const CMPL_MODEL = 'https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories15M-q4_0.gguf';
const CMPL_MODEL_SIZE = '19MB';
diff --git a/examples/embeddings/index.html b/examples/embeddings/index.html
index a349523..6b0b9f1 100644
--- a/examples/embeddings/index.html
+++ b/examples/embeddings/index.html
@@ -22,11 +22,8 @@
import { Wllama } from '../../esm/index.js';
const CONFIG_PATHS = {
- 'single-thread/wllama.js' : '../../esm/single-thread/wllama.js',
- 'single-thread/wllama.wasm' : '../../esm/single-thread/wllama.wasm',
- 'multi-thread/wllama.js' : '../../esm/multi-thread/wllama.js',
- 'multi-thread/wllama.wasm' : '../../esm/multi-thread/wllama.wasm',
- 'multi-thread/wllama.worker.mjs': '../../esm/multi-thread/wllama.worker.mjs',
+ 'single-thread/wllama.wasm': '../../esm/single-thread/wllama.wasm',
+ 'multi-thread/wllama.wasm' : '../../esm/multi-thread/wllama.wasm',
};
const MODEL = 'https://huggingface.co/CompendiumLabs/bge-base-en-v1.5-gguf/resolve/main/bge-base-en-v1.5-q4_k_m.gguf';
const TEXTS = [
diff --git a/examples/main/src/components/ChatScreen.tsx b/examples/main/src/components/ChatScreen.tsx
index 27d7a31..ebe208f 100644
--- a/examples/main/src/components/ChatScreen.tsx
+++ b/examples/main/src/components/ChatScreen.tsx
@@ -16,7 +16,7 @@ export default function ChatScreen() {
isGenerating,
createCompletion,
navigateTo,
- currModel,
+ loadedModel,
getWllamaInstance,
stopCompletion,
} = useWllama();
@@ -64,8 +64,8 @@ export default function ChatScreen() {
}
// generate response
- if (!currModel) {
- throw new Error('currModel is null');
+ if (!loadedModel) {
+ throw new Error('loadedModel is null');
}
const formattedChat = await formatChat(getWllamaInstance(), [
...currHistory,
@@ -118,7 +118,7 @@ export default function ChatScreen() {
)}
- {currModel && (
+ {loadedModel && (
)}
- {!currModel && }
+ {!loadedModel && }
wllama may generate inaccurate information. Use with your own risk.
diff --git a/examples/main/src/components/ModelScreen.tsx b/examples/main/src/components/ModelScreen.tsx
index 1779c34..73bbdc6 100644
--- a/examples/main/src/components/ModelScreen.tsx
+++ b/examples/main/src/components/ModelScreen.tsx
@@ -1,4 +1,4 @@
-import { ManageModel, ModelState, Screen } from '../utils/types';
+import { ModelState, Screen } from '../utils/types';
import { useWllama } from '../utils/wllama.context';
import { FontAwesomeIcon } from '@fortawesome/react-fontawesome';
import {
@@ -8,23 +8,24 @@ import {
faCheck,
} from '@fortawesome/free-solid-svg-icons';
import { DEFAULT_INFERENCE_PARAMS, MAX_GGUF_SIZE } from '../config';
-import { toHumanReadableSize } from '../utils/utils';
-import { useState } from 'react';
+import { toHumanReadableSize, useDebounce } from '../utils/utils';
+import { useEffect, useState } from 'react';
import ScreenWrapper from './ScreenWrapper';
+import { DisplayedModel } from '../utils/displayed-model';
export default function ModelScreen() {
const [showAddCustom, setShowAddCustom] = useState(false);
const {
models,
- removeModel,
+ removeCachedModel,
isLoadingModel,
isDownloading,
- currModel,
+ loadedModel,
currParams,
setParams,
} = useWllama();
- const blockModelBtn = !!(currModel || isDownloading || isLoadingModel);
+ const blockModelBtn = !!(loadedModel || isDownloading || isLoadingModel);
const onChange = (key: keyof typeof currParams) => (e: any) => {
setParams({ ...currParams, [key]: parseFloat(e.target.value || -1) });
@@ -101,7 +102,7 @@ export default function ModelScreen() {
)
) {
for (const m of models) {
- await removeModel(m);
+ await removeCachedModel(m);
}
}
}}
@@ -123,7 +124,7 @@ export default function ModelScreen() {
{models
- .filter((m) => m.userAdded)
+ .filter((m) => m.isUserAdded)
.map((m) => (
))}
@@ -133,7 +134,7 @@ export default function ModelScreen() {
Recommended models
{models
- .filter((m) => !m.userAdded)
+ .filter((m) => !m.isUserAdded)
.map((m) => (
))}
@@ -150,12 +151,58 @@ export default function ModelScreen() {
function AddCustomModelDialog({ onClose }: { onClose(): void }) {
const { isLoadingModel, addCustomModel } = useWllama();
- const [url, setUrl] = useState('');
+ const [hfRepo, setHfRepo] = useState('');
+ const [hfFile, setHfFile] = useState('');
+ const [hfFiles, setHfFiles] = useState<string[]>([]);
+ const [abortSignal, setAbortSignal] = useState<AbortController>(
+ new AbortController()
+ );
+ const [err, setErr] = useState<string>();
+ useDebounce(
+ async () => {
+ if (hfRepo.length < 2) {
+ setHfFiles([]);
+ return;
+ }
+ try {
+ const res = await fetch(`https://huggingface.co/api/models/${hfRepo}`, {
+ signal: abortSignal.signal,
+ });
+ const data: { siblings?: { rfilename: string }[] } = await res.json();
+ if (data.siblings) {
+ setHfFiles(
+ data.siblings
+ .map((s) => s.rfilename)
+ .filter((f) => f.endsWith('.gguf'))
+ );
+ setErr('');
+ } else {
+ setErr('no model found or it is private');
+ setHfFiles([]);
+ }
+ } catch (e) {
+ if ((e as Error).name !== 'AbortError') {
+ setErr((e as any)?.message ?? 'unknown error');
+ setHfFiles([]);
+ }
+ }
+ },
+ [hfRepo],
+ 500
+ );
+
+ useEffect(() => {
+ if (hfFiles.length === 0) {
+ setHfFile('');
+ }
+ }, [hfFiles]);
+
const onSubmit = async () => {
try {
- await addCustomModel(url);
+ await addCustomModel(
+ `https://huggingface.co/${hfRepo}/resolve/main/${hfFile}`
+ );
onClose();
} catch (e) {
setErr((e as any)?.message ?? 'unknown error');
@@ -180,21 +227,36 @@ function AddCustomModelDialog({ onClose }: { onClose(): void }) {
+
{err && Error: {err}
}