
Add a Lilac dataset reader. #563

Merged
merged 15 commits into from
Oct 31, 2023
4 changes: 4 additions & 0 deletions llama_hub/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -979,5 +979,9 @@
"repository",
"collaborators"
]
},
"LilacReader": {
"id": "lilac_reader",
"author": "nsthorat"
}
}
66 changes: 66 additions & 0 deletions llama_hub/lilac_reader/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Lilac reader

[Lilac](https://lilacml.com/) is an open-source product that helps you analyze, enrich, and clean unstructured data with AI.

Use it to analyze, clean, structure, and label data for downstream LlamaIndex and LangChain applications.

## Lilac projects

This assumes you've already run Lilac locally and have a project directory with a dataset. For more details on Lilac projects, see [Lilac Projects](https://lilacml.com/projects/projects.html).

You can use any LlamaIndex loader to load data into Lilac, clean the data, and then bring it back into LlamaIndex Documents.
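
If you don't have a project yet, a minimal sketch for creating one (the `./data` path is just an example):

```python
import lilac as ll

# Create a new Lilac project directory.
# Skip this if you already have a project set up.
ll.init(project_dir='./data')
```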

## Usage

### LlamaIndex documents

See [this notebook](https://github.com/lilacai/lilac/blob/main/notebooks/LlamaIndexLoader.ipynb) for getting data into Lilac from LlamaHub.

```python
from llama_index import download_loader
import lilac as ll

# See: https://llamahub.ai/l/papers-arxiv
ArxivReader = download_loader("ArxivReader")

loader = ArxivReader()
documents = loader.load_data(search_query='au:Karpathy')

# Set the project directory for Lilac.
ll.set_project_dir('./data')

# This assumes you already have a Lilac project set up.
# If you don't, use ll.init(project_dir='./data')
ll.create_dataset(
    config=ll.DatasetConfig(
        namespace='local',
        name='arxiv-karpathy',
        source=ll.LlamaIndexDocsSource(
            # `documents` comes from the loader.load_data call above.
            documents=documents,
        ),
    )
)

# You can start a Lilac server with the line below. Once you've cleaned
# the dataset, you can bring it back into LlamaIndex.
ll.start_server(project_dir='./data')
```

### Lilac => LlamaIndex Documents

```python
from llama_index import GPTVectorStoreIndex, download_loader

LilacReader = download_loader('LilacReader')

loader = LilacReader()
documents = loader.load_data(
    project_dir='~/my_project',
    # The name of your dataset in the project dir.
    dataset='local/arxiv-karpathy',
)

index = GPTVectorStoreIndex.from_documents(documents)

index.as_query_engine().query("How are ImageNet labels validated?")
```
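
Everything other than the text and doc_id columns comes back as `extra_info` metadata on each Document. A quick sketch for inspecting what the reader returned (field contents depend entirely on your dataset):

```python
# Peek at the first few documents. The text and doc_id fields are pulled
# from the dataset's 'text' and 'doc_id' columns; the remaining enriched
# or labeled columns land in extra_info.
for doc in documents[:3]:
    print(doc.doc_id)
    print(doc.text[:80])
    print(doc.extra_info)
```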

This loader is designed to load cleaned and labeled Lilac data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or a [LangChain](https://github.com/hwchase17/langchain) Agent.
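
For LangChain, one option is to wrap the index's query engine as a Tool. A hedged sketch, assuming the pre-0.1 LangChain agent API and an OpenAI key in your environment (the tool name and description are illustrative):

```python
from langchain.agents import Tool, initialize_agent
from langchain.llms import OpenAI

# Wrap the LlamaIndex query engine as a LangChain tool.
query_engine = index.as_query_engine()
tools = [
    Tool(
        name="lilac_arxiv_index",
        func=lambda q: str(query_engine.query(q)),
        description="Answers questions about the cleaned arxiv-karpathy dataset.",
    )
]

# Build a simple zero-shot agent that can call the tool.
agent = initialize_agent(tools, OpenAI(temperature=0), agent="zero-shot-react-description")
print(agent.run("How are ImageNet labels validated?"))
```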
1 change: 1 addition & 0 deletions llama_hub/lilac_reader/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Init file."""
100 changes: 100 additions & 0 deletions llama_hub/lilac_reader/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""Lilac reader that loads enriched and labeled Lilac datasets into GPTIndex and LangChain."""
from typing import TYPE_CHECKING, List, Optional

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document

if TYPE_CHECKING:
from lilac import FilterLike, Path, ColumnId

class LilacReader(BaseReader):
    """Lilac dataset reader."""

    def load_data(
        self,
        dataset: str,
        text_path: "Path" = 'text',
        doc_id_path: Optional["Path"] = 'doc_id',
        columns: Optional[List['ColumnId']] = None,
        filters: Optional[List["FilterLike"]] = None,
        project_dir: Optional[str] = None,
    ) -> List[Document]:
"""
Load text from relevant posts and top-level comments in subreddit(s), given keyword(s) for search

Args:
project_dir (Optional[str]): The Lilac project dir to read from. If not defined, uses the `LILAC_PROJECT_DIR`
environment variable.
text_path: The path to the text field in the dataset. If not defined, uses 'text'.
columns (Optional[List[ColumnId]]): The columns to load from the dataset. If not defined, loads all columns.
dataset (str): The dataset to load. Should be formatted like {namespace}/{dataset_name}.
filters (Optional[Filter]): A filter to apply to the dataset before loading into documents. Useful to filter
for labeled data.

"""

        import lilac as ll

        namespace, dataset_name = dataset.split("/")
        lilac_dataset = ll.get_dataset(namespace, dataset_name, project_dir=project_dir)

        # Check that the text path and doc_id path are valid.
        manifest = lilac_dataset.manifest()

        text_path = ll.normalize_path(text_path)
        text_field = manifest.data_schema.get_field(text_path)
        if not text_field:
            raise ValueError(f"Could not find text field {text_path} in dataset {dataset}")

        doc_id_path = ll.normalize_path(doc_id_path)
        doc_id_field = manifest.data_schema.get_field(doc_id_path)
        if not doc_id_field:
            raise ValueError(f"Could not find doc_id field {doc_id_path} in dataset {dataset}")

        rows = lilac_dataset.select_rows(
            # Select the requested columns plus the text and doc_id paths ('*' selects all).
            columns=(columns + [text_path, doc_id_path]) if columns else ['*'],
            filters=filters,
            combine_columns=True,
        )

        def _item_from_path(item: ll.Item, path: ll.PathTuple) -> ll.Item:
            # Walk the nested row dict along `path` and return the leaf value.
            if len(path) == 1:
                item = item[path[0]]
                if isinstance(item, dict):
                    return item[ll.VALUE_KEY]
                else:
                    return item
            else:
                return _item_from_path(item[path[0]], path[1:])

        def _remove_item_path(item: ll.Item, path: ll.PathTuple) -> None:
            # Delete the value at `path` so it isn't duplicated in extra_info.
            if len(path) == 0:
                return
            if len(path) == 1:
                if item and path[0] in item:
                    leaf_item = item[path[0]]
                    if isinstance(leaf_item, dict):
                        del item[path[0]][ll.VALUE_KEY]
                    else:
                        del item[path[0]]
                return
            else:
                _remove_item_path(item[path[0]], path[1:])

        documents: List[Document] = []
        for row in rows:
            text = _item_from_path(row, text_path)
            doc_id = _item_from_path(row, doc_id_path)
            # Strip the text and doc_id from the row so the remaining
            # columns become extra_info metadata on the Document.
            _remove_item_path(row, text_path)
            _remove_item_path(row, doc_id_path)
            documents.append(
                Document(
                    text=text,
                    doc_id=doc_id,
                    extra_info=row or {},
                )
            )

        return documents

1 change: 1 addition & 0 deletions llama_hub/lilac_reader/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
lilac~=0.1.5