Skip to content

Commit

Permalink
splitters need to be warmed up (#173)
Browse files Browse the repository at this point in the history
* splitters need to be warmed up

* adding a test where HierarchicalDocumentSplitter is called in a pipeline

* adding NLTK as extra dependency

* fixing typo
  • Loading branch information
davidsbatista authored Jan 22, 2025
1 parent 1477960 commit 49ab954
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def _build_block_sizes(self):
self.splitters[block_size] = DocumentSplitter(
split_length=block_size, split_overlap=self.split_overlap, split_by=self.split_by
)
self.splitters[block_size].warm_up()

@staticmethod
def _add_meta_data(document: Document):
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ extra-dependencies = [
# LLMMetadataExtractor dependencies
"amazon-bedrock-haystack>=1.1.1",
"google-vertex-haystack>=2.0.0",
# HierarchicalDocumentSplitter with split_by="sentence"
"nltk"
]

[tool.hatch.envs.test.scripts]
Expand Down
21 changes: 21 additions & 0 deletions test/components/splitters/test_hierarchical_doc_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,24 @@ def test_serialization_deserialization_pipeline(self):

new_pipeline = Pipeline.from_dict(pipeline_dict)
assert new_pipeline == pipeline

def test_split_by_sentence_assure_warm_up_was_called(self):
    """Run a sentence-based HierarchicalDocumentSplitter inside a pipeline.

    The splitter feeds a DocumentWriter backed by an in-memory store. A
    three-sentence document is pushed through the pipeline, and the test
    checks that three documents are reported as written and that the store
    holds three entries.

    NOTE(review): per the test name, this is meant to guard against the
    internal splitters not being warmed up before use; the assertions
    themselves only check the written-document counts.
    """
    pipe = Pipeline()
    splitter = HierarchicalDocumentSplitter(block_sizes={10, 5, 2}, split_overlap=0, split_by="sentence")
    store = InMemoryDocumentStore()

    # Wire splitter -> writer; component names are referenced again in run().
    pipe.add_component(name="hierarchical_doc_splitter", instance=splitter)
    pipe.add_component(name="doc_writer", instance=DocumentWriter(document_store=store))
    pipe.connect("hierarchical_doc_splitter.documents", "doc_writer")

    source_doc = Document(content="This is one sentence. This is another sentence. This is the third sentence.")
    result = pipe.run({"hierarchical_doc_splitter": {"documents": [source_doc]}})

    written_count = result["doc_writer"]["documents_written"]
    assert written_count == 3
    assert len(store.storage.values()) == 3

0 comments on commit 49ab954

Please sign in to comment.