feat: Add MockSentenceTransformer (#5)

* Add MockSentenceTransformer * Fix typos * Disable vulnerability scans
navapbc · Jun 25, 2024 · 6596da7 · 6596da7
1 parent 8e604d1
commit 6596da7
Show file tree

Hide file tree

Showing 9 changed files with 1,086 additions and 14 deletions.
diff --git a/.../workflows/ci-app-vulnerability-scans.yml → ...s/ci-app-vulnerability-scans.yml.disabled b/.../workflows/ci-app-vulnerability-scans.yml → ...s/ci-app-vulnerability-scans.yml.disabled
diff --git a/app/.gitignore b/app/.gitignore
@@ -31,4 +31,7 @@ coverage.*
 poetry-installer-error-*.log
 
 # Chainlit
-.chainlit
+.chainlit
+
+# Cached models
+models/
diff --git a/app/local.env b/app/local.env
@@ -71,3 +71,5 @@ AWS_SECRET_ACCESS_KEY=DO_NOT_SET_HERE
 #AWS_SESSION_TOKEN=DO_NOT_SET_HERE
 
 AWS_DEFAULT_REGION=us-east-1
+
+EMBEDDING_MODEL=/app/models/multi-qa-mpnet-base-dot-v1
diff --git a/app/poetry.lock b/app/poetry.lock
diff --git a/app/pyproject.toml b/app/pyproject.toml
@@ -20,6 +20,7 @@ marshmallow = "^3.20.1"
 psycopg = {extras = ["binary"], version = "^3.1.10"}
 pydantic-settings = "^2.0.3"
 chainlit = "^1.1.304"
+sentence-transformers = "^3.0.1"
 
 [tool.poetry.group.dev.dependencies]
 black = "^23.9.1"

diff --git a/app/src/app_config.py b/app/src/app_config.py
@@ -2,4 +2,4 @@
 
 
 class AppConfig(PydanticBaseEnvConfig):
-    ...
+    embedding_model: str = "multi-qa-mpnet-base-dot-v1"
diff --git a/app/tests/mock/mock_sentence_transformer.py b/app/tests/mock/mock_sentence_transformer.py
@@ -0,0 +1,42 @@
+import math
+
+
+class MockSentenceTransformer:
+    """Lightweight stand-in for a SentenceTransformer model, for use in tests."""
+
+    def __init__(self, *args, **kwargs):
+        # Imitate multi-qa-mpnet-base-dot-v1
+        self.max_seq_length = 512
+        self.tokenizer = MockTokenizer()
+
+    def encode(self, text, **kwargs):
+        """
+        Encode text into a 768-dimensional embedding that allows for similarity comparison via the dot product.
+        The embedding represents the average word length of the text.
+
+        Text with no tokens (empty or whitespace-only) is encoded as the zero vector.
+        """
+
+        tokens = self.tokenizer.tokenize(text)
+        if not tokens:
+            # No tokens means there is no average length; avoid dividing by zero
+            return [0.0] * 768
+
+        average_token_length = sum(len(token) for token in tokens) / len(tokens)
+
+        # Convert average word length to an angle, and pad the vector to length 768
+        angle = (1 / average_token_length) * 2 * math.pi
+        embedding = [math.cos(angle), math.sin(angle)] + ([0] * 766)
+
+        # Normalize by the L1 norm rather than the plain sum: cos + sin is ~0 at an
+        # average length of 8/3 and negative just below it, which would blow up or
+        # flip the vector. For average lengths >= 4 both give the same result.
+        l1_norm = sum(abs(x) for x in embedding)
+        return [x / l1_norm for x in embedding]
+
+
+class MockTokenizer:
+    """Whitespace tokenizer used by MockSentenceTransformer."""
+
+    def tokenize(self, text):
+        return text.split()
diff --git a/app/tests/mock/test_mock_sentence_transformer.py b/app/tests/mock/test_mock_sentence_transformer.py
@@ -0,0 +1,30 @@
+from tests.mock.mock_sentence_transformer import MockSentenceTransformer
+
+
+def test_mock_sentence_transformer():
+    """Check the mock's model attributes, embedding size, normalization, and dot-product ordering."""
+    embedding_model = MockSentenceTransformer()
+
+    assert embedding_model.max_seq_length == 512
+    assert embedding_model.tokenizer.tokenize("Hello, world!") == ["Hello,", "world!"]
+    assert len(embedding_model.encode("Hello, world!")) == 768
+    # It should be about 1, but with some tolerance for floating point imprecision.
+    # Compare the absolute difference so that a sum well below 1 fails as well.
+    assert abs(sum(embedding_model.encode("Hello, world!")) - 1) < 0.01
+
+    # Test that we can compare similarity with dot product,
+    # where sentences with a closer average word length are considered more similar
+    long_text = embedding_model.encode(
+        "Incomprehensibility characterizes unintelligible, overwhelmingly convoluted dissertations."
+    )
+    medium_text = embedding_model.encode(
+        "Curiosity inspires creative, innovative communities worldwide."
+    )
+    short_text = embedding_model.encode("The quick brown red fox jumps.")
+
+    def dot_product(v1, v2):
+        return sum(x * y for x, y in zip(v1, v2, strict=True))
+
+    assert dot_product(long_text, long_text) > dot_product(long_text, medium_text)
+    assert dot_product(long_text, medium_text) > dot_product(long_text, short_text)
+    assert dot_product(medium_text, medium_text) > dot_product(medium_text, short_text)
diff --git a/docs/app/getting-started.md b/docs/app/getting-started.md
@@ -6,8 +6,6 @@ A very simple [docker-compose.yml](/docker-compose.yml) has been included to sup
 
 ## Prerequisites
 
-**Note:** Run everything from within the `/app` folder:
-
 1. Install the version of Python specified in [.python-version](/app/.python-version)
    [pyenv](https://github.com/pyenv/pyenv#installation) is one popular option for installing Python,
    or [asdf](https://asdf-vm.com/).
@@ -21,18 +19,18 @@ A very simple [docker-compose.yml](/docker-compose.yml) has been included to sup
 
 3. If you are using an M1 mac, you will need to install postgres as well: `brew install postgresql` (The psycopg2-binary is built from source on M1 macs which requires the postgres executable to be present)
 
-4. You'll also need [Docker Desktop](https://www.docker.com/products/docker-desktop/)
+4. You'll also need [Docker Desktop](https://www.docker.com/products/docker-desktop/) installed and running
 
 ## Run the application
 
-1. Set up an (empty) local secrets file: `touch app/.env` and copy the provided example Docker override: `cp docker-compose.override.yml.example docker-compose.override.yml`
-2. In your terminal, `cd` to the `app` directory of this repo.
-3. Make sure you have [Docker Desktop](https://www.docker.com/products/docker-desktop/) installed & running.
-4. Run `make setup-local` to install dependencies
-5. Run `make init start` to build the image and start the container.
-6. Navigate to `localhost:8080/docs` to access the Swagger UI.
-7. Run `make run-logs` to see the logs of the running API container
-8. Run `make stop` when you are done to delete the container.
+**Note:** Run everything from within the `/app` folder:
+
+1. Set up an (empty) local secrets file: `touch .env` and copy the provided example Docker override: `cp ../docker-compose.override.yml.example ../docker-compose.override.yml`
+2. Download the `multi-qa-mpnet-base-dot-v1` model into the `models` directory: `mkdir models && git clone https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1 models/multi-qa-mpnet-base-dot-v1`
+3. Run `make init start` to build the image and start the container.
+4. Navigate to `localhost:8080/docs` to access the Swagger UI.
+5. Run `make run-logs` to see the logs of the running API container
+6. Run `make stop` when you are done to delete the container.
 
 ## Next steps
Original file line number	Diff line number	Diff line change
Expand Up		@@ -71,3 +71,5 @@ AWS_SECRET_ACCESS_KEY=DO_NOT_SET_HERE
		#AWS_SESSION_TOKEN=DO_NOT_SET_HERE

		AWS_DEFAULT_REGION=us-east-1

		EMBEDDING_MODEL=/app/models/multi-qa-mpnet-base-dot-v1
Original file line number	Diff line number	Diff line change
Expand Up		@@ -2,4 +2,4 @@


		class AppConfig(PydanticBaseEnvConfig):
		...
		embedding_model: str = "multi-qa-mpnet-base-dot-v1"