from typing import Dict, Optional
import warnings

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

warnings.simplefilter("ignore")


def create_retriever(
    pdf_directory: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 100,
    embedding_model_name: str = "sentence-transformers/all-mpnet-base-v2",
    model_kwargs: Optional[Dict[str, str]] = None,
    persist_directory: str = "chroma_db",
):
    """
    Create and return a retriever over the PDFs found in *pdf_directory*.

    Loads every PDF in the directory, splits the documents into overlapping
    chunks, embeds the chunks with a HuggingFace sentence-transformer model,
    stores them in a persistent Chroma vector database, and returns the
    database's retriever interface.

    Args:
        pdf_directory (str): Path to the directory containing PDF files.
        chunk_size (int): Size of each chunk for splitting documents.
        chunk_overlap (int): Overlap size between adjacent chunks; must be
            non-negative and strictly smaller than ``chunk_size``.
        embedding_model_name (str): Name of the HuggingFace embedding model.
        model_kwargs (dict, optional): Extra keyword arguments for the
            embedding model. Defaults to ``{"device": "cpu"}`` when omitted.
        persist_directory (str): Directory where the Chroma database is
            persisted. Defaults to ``"chroma_db"``.

    Returns:
        retriever: Retriever object for retrieving document chunks.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is invalid.
    """
    if chunk_size <= 0:
        raise ValueError("Chunk size must be a positive integer.")
    if chunk_overlap < 0 or chunk_overlap >= chunk_size:
        raise ValueError(
            "Chunk overlap must be a non-negative integer less than the chunk size."
        )

    # Avoid a mutable default argument: the previous shared-dict default
    # could be mutated by one caller and leak into every later call.
    if model_kwargs is None:
        model_kwargs = {"device": "cpu"}

    # Load documents
    loader = PyPDFDirectoryLoader(pdf_directory)
    documents = loader.load()

    # Split documents into small chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    all_splits = text_splitter.split_documents(documents)

    # Specify embedding model
    embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model_name, model_kwargs=model_kwargs
    )

    # Embed document chunks into a persistent vector store
    vectordb = Chroma.from_documents(
        documents=all_splits, embedding=embeddings, persist_directory=persist_directory
    )

    # Create and return retriever
    return vectordb.as_retriever()