
Add a Lilac dataset reader. #563

Merged
merged 15 commits into from
Oct 31, 2023
4 changes: 4 additions & 0 deletions llama_hub/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -979,5 +979,9 @@
"repository",
"collaborators"
]
},
"LilacReader": {
"id": "lilac_reader",
"author": "nsthorat"
}
}
66 changes: 66 additions & 0 deletions llama_hub/lilac_reader/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Lilac reader

[Lilac](https://lilacml.com/) is an open-source product that helps you analyze, enrich, and clean unstructured data with AI.

Use it to analyze, clean, structure, and label data for downstream LlamaIndex and LangChain applications.

## Lilac projects

This assumes you've already run Lilac locally and have a project directory with a dataset. For more details on Lilac projects, see [Lilac Projects](https://lilacml.com/projects/projects.html).

You can use any LlamaIndex loader to load data into Lilac, clean the data, and then bring it back into LlamaIndex Documents.
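
If you don't have a project yet, a minimal sketch for creating one (the `./data` path is just an example):

```python
import lilac as ll

# Create a new Lilac project directory.
# Skip this if you already have a project set up.
ll.init(project_dir='./data')
```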

## Usage

### LlamaIndex documents

See [this notebook](https://github.com/lilacai/lilac/blob/main/notebooks/LlamaIndexLoader.ipynb) for getting data into Lilac from LlamaHub.

```python
from llama_index import download_loader
import lilac as ll

# See: https://llamahub.ai/l/papers-arxiv
ArxivReader = download_loader("ArxivReader")

loader = ArxivReader()
documents = loader.load_data(search_query='au:Karpathy')

# Set the project directory for Lilac.
ll.set_project_dir('./data')

# This assumes you already have a Lilac project set up.
# If you don't, use ll.init(project_dir='./data')
ll.create_dataset(
    config=ll.DatasetConfig(
        namespace='local',
        name='arxiv-karpathy',
        source=ll.LlamaIndexDocsSource(
            # `documents` comes from the loader.load_data call above.
            documents=documents,
        ),
    )
)

# You can start a Lilac server with the line below. Once you've cleaned
# the dataset, you can bring it back into LlamaIndex.
ll.start_server(project_dir='./data')
```

### Lilac => LlamaIndex Documents

```python
from llama_index import GPTVectorStoreIndex, download_loader

LilacReader = download_loader('LilacReader')

loader = LilacReader()
documents = loader.load_data(
    project_dir='~/my_project',
    # The name of your dataset in the project dir.
    dataset='local/arxiv-karpathy',
)

index = GPTVectorStoreIndex.from_documents(documents)

index.as_query_engine().query("How are ImageNet labels validated?")
```
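
Everything other than the text and doc_id columns comes back as `extra_info` metadata on each Document. A quick sketch for inspecting what the reader returned (field contents depend entirely on your dataset):

```python
# Peek at the first few documents. The text and doc_id fields are pulled
# from the dataset's 'text' and 'doc_id' columns; the remaining enriched
# or labeled columns land in extra_info.
for doc in documents[:3]:
    print(doc.doc_id)
    print(doc.text[:80])
    print(doc.extra_info)
```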

This loader is designed to load cleaned and labeled Lilac data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or a [LangChain](https://github.com/hwchase17/langchain) Agent.
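
For LangChain, one option is to wrap the index's query engine as a Tool. A hedged sketch, assuming the pre-0.1 LangChain agent API and an OpenAI key in your environment (the tool name and description are illustrative):

```python
from langchain.agents import Tool, initialize_agent
from langchain.llms import OpenAI

# Wrap the LlamaIndex query engine as a LangChain tool.
query_engine = index.as_query_engine()
tools = [
    Tool(
        name="lilac_arxiv_index",
        func=lambda q: str(query_engine.query(q)),
        description="Answers questions about the cleaned arxiv-karpathy dataset.",
    )
]

# Build a simple zero-shot agent that can call the tool.
agent = initialize_agent(tools, OpenAI(temperature=0), agent="zero-shot-react-description")
print(agent.run("How are ImageNet labels validated?"))
```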
1 change: 1 addition & 0 deletions llama_hub/lilac_reader/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Init file."""
100 changes: 100 additions & 0 deletions llama_hub/lilac_reader/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""Lilac reader that loads enriched and labeled Lilac datasets into GPTIndex and LangChain."""
from typing import TYPE_CHECKING, List, Optional

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document

if TYPE_CHECKING:
from lilac import FilterLike, Path, ColumnId

class LilacReader(BaseReader):
    """Lilac dataset reader."""

    def load_data(
        self,
        dataset: str,
        text_path: "Path" = 'text',
        doc_id_path: Optional["Path"] = 'doc_id',
        columns: Optional[List['ColumnId']] = None,
        filters: Optional[List["FilterLike"]] = None,
        project_dir: Optional[str] = None,
    ) -> List[Document]:
"""
Load text from relevant posts and top-level comments in subreddit(s), given keyword(s) for search

Args:
project_dir (Optional[str]): The Lilac project dir to read from. If not defined, uses the `LILAC_PROJECT_DIR`
environment variable.
text_path: The path to the text field in the dataset. If not defined, uses 'text'.
columns (Optional[List[ColumnId]]): The columns to load from the dataset. If not defined, loads all columns.
dataset (str): The dataset to load. Should be formatted like {namespace}/{dataset_name}.
filters (Optional[Filter]): A filter to apply to the dataset before loading into documents. Useful to filter
for labeled data.

"""

        import lilac as ll

        namespace, dataset_name = dataset.split("/")
        lilac_dataset = ll.get_dataset(namespace, dataset_name, project_dir=project_dir)

        # Check that the text path and doc_id path are valid.
        manifest = lilac_dataset.manifest()

        text_path = ll.normalize_path(text_path)
        text_field = manifest.data_schema.get_field(text_path)
        if not text_field:
            raise ValueError(f"Could not find text field {text_path} in dataset {dataset}")

        doc_id_path = ll.normalize_path(doc_id_path)
        doc_id_field = manifest.data_schema.get_field(doc_id_path)
        if not doc_id_field:
            raise ValueError(f"Could not find doc_id field {doc_id_path} in dataset {dataset}")

        rows = lilac_dataset.select_rows(
            # Select the requested columns plus the text and doc_id paths ('*' selects all).
            columns=(columns + [text_path, doc_id_path]) if columns else ['*'],
            filters=filters,
            combine_columns=True,
        )

        def _item_from_path(item: ll.Item, path: ll.PathTuple) -> ll.Item:
            # Walk the nested row dict along `path` and return the leaf value.
            if len(path) == 1:
                item = item[path[0]]
                if isinstance(item, dict):
                    return item[ll.VALUE_KEY]
                else:
                    return item
            else:
                return _item_from_path(item[path[0]], path[1:])

        def _remove_item_path(item: ll.Item, path: ll.PathTuple) -> None:
            # Delete the value at `path` so it isn't duplicated in extra_info.
            if len(path) == 0:
                return
            if len(path) == 1:
                if item and path[0] in item:
                    leaf_item = item[path[0]]
                    if isinstance(leaf_item, dict):
                        del item[path[0]][ll.VALUE_KEY]
                    else:
                        del item[path[0]]
                return
            else:
                _remove_item_path(item[path[0]], path[1:])

        documents: List[Document] = []
        for row in rows:
            text = _item_from_path(row, text_path)
            doc_id = _item_from_path(row, doc_id_path)
            # Strip the text and doc_id from the row so the remaining
            # columns become extra_info metadata on the Document.
            _remove_item_path(row, text_path)
            _remove_item_path(row, doc_id_path)
            documents.append(
                Document(
                    text=text,
                    doc_id=doc_id,
                    extra_info=row or {},
                )
            )

        return documents

1 change: 1 addition & 0 deletions llama_hub/lilac_reader/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
lilac~=0.1.5