database_handler.py

"""
CLI handler to create or update the Chroma database.

Author
------
Nicolas Rojas
"""

# import libraries
from argparse import ArgumentParser
import os.path
from datetime import datetime
import yaml
import chromadb
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore


def create_index(
    chroma_collection_name: str,
    documents_dir: str,
    index_dir: str,
    embedding_model: str,
):
    """Create vector database from documents folder.

    Parameters
    ----------
    chroma_collection_name : str
        Name of the Chroma collection to be created.
    documents_dir : str
        Directory where the documents are stored.
    index_dir : str
        Directory where the index is saved.
    embedding_model : str
        Huggingface embedding model to vectorize the documents.
    """
    # create Chroma vector store
    chroma_client = chromadb.PersistentClient(path=index_dir)
    chroma_collection = chroma_client.get_or_create_collection(chroma_collection_name)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # load the documents and create the index
    embed_model = HuggingFaceEmbedding(model_name=embedding_model)
    documents = SimpleDirectoryReader(documents_dir).load_data()
    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context, embed_model=embed_model
    )


def update_index(
    chroma_collection_name: str,
    documents_dir: str,
    index_dir: str,
    embedding_model: str,
):
    """Update vector database with new or changed files.

    Parameters
    ----------
    chroma_collection_name : str
        Name of the Chroma collection to be updated.
    documents_dir : str
        Directory where the documents are stored.
    index_dir : str
        Directory where the index is saved.
    embedding_model : str
        Huggingface embedding model to vectorize the documents.
    """
    # load the existing index
    chroma_client = chromadb.PersistentClient(path=index_dir)
    chroma_collection = chroma_client.get_or_create_collection(chroma_collection_name)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    embed_model = HuggingFaceEmbedding(model_name=embedding_model)
    index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=embed_model,
    )
    documents = SimpleDirectoryReader(documents_dir).load_data()

    for doc in documents:
        doc_id = doc.metadata["file_path"]

        # Check if document already exists in the index
        existing_node = index.docstore.document_exists(doc_id)

        if existing_node:
            existing_mtime = datetime.fromisoformat(
                existing_node.metadata["last_modified"]
            )
            current_mtime = datetime.fromtimestamp(os.path.getmtime(doc_id))

            # If the file has been modified, update it
            if current_mtime > existing_mtime:
                index.update_ref_doc(doc_id, doc)
        else:
            # If the document doesn't exist, insert it
            index.insert(doc)
    # Persist changes
    index.storage_context.persist(persist_dir=index_dir)


if __name__ == "__main__":
    parser = ArgumentParser(
        description="Utility to manage database index with LlamaIndex and Chroma"
    )
    parser.add_argument(
        "-c", "--create", action="store_true", help="Create the database"
    )
    parser.add_argument(
        "-u", "--update", action="store_true", help="Update the database"
    )
    args = parser.parse_args()
    # load configuration variables
    with open("config.yaml", "r", encoding="utf8") as yfile:
        parameters = yaml.safe_load(yfile)

    index_dir = parameters["index_directory"]
    chunk_size = parameters["chunk_size"]
    embedding_model = parameters["embedding_model"]
    chroma_collection = parameters["chroma_collection"]
    documents_dir = parameters["documents_dir"]

    # Set custom RAG settings
    Settings.chunk_size = chunk_size
    Settings.embed_model = HuggingFaceEmbedding(model_name=embedding_model)

    # if both arguments are true or both are false, throw error
    if args.create == args.update:
        raise ValueError(
            "Use the program with argument -c OR -u. Use flag -h for help."
        )

    # create database if doesnt exist yet
    if args.create:
        if os.path.exists(index_dir):
            raise FileExistsError(f"The file {index_dir} already exists")
        create_index(chroma_collection, documents_dir, index_dir, embedding_model)

    # update existing database
    elif args.update:
        if not os.path.exists(index_dir):
            raise FileNotFoundError(f"The file {index_dir} does not exist")
        update_index(chroma_collection, documents_dir, index_dir, embedding_model)