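# NOTE: the header below is page chrome captured from a web file viewer
# (apparently the Hugging Face Hub), preserved as comments so the file parses.
#   uploader: 52100303-TranPhuocSang
#   commit:   24988f9 - "Update model, RAG with CTransformer"
#   size:     1.59 kB (scan status: no virus)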
import os
import re
import unicodedata

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ai21 import AI21SemanticTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Directory scanned for source PDFs, and directory the FAISS index is saved to.
pdf_data_path = './documents'
vector_db_path = './db'
# Hugging Face model id used to embed the text chunks.
model_name = 'bkai-foundation-models/vietnamese-bi-encoder'

# Re-export the project's AI21_TOKEN under the name AI21_API_KEY (presumably
# the variable the AI21 client reads - confirm against the langchain_ai21 docs).
# Assigning None to os.environ raises "TypeError: str expected, not NoneType",
# so only export when the token is actually configured; an AI21_API_KEY that is
# already present in the environment is then left untouched.
AI21_TOKEN = os.getenv('AI21_TOKEN')
if AI21_TOKEN is not None:
    os.environ["AI21_API_KEY"] = AI21_TOKEN
# Anything that is not a word character, whitespace, comma, period or hyphen.
_DISALLOWED_CHARS = re.compile(r'[^\w\s,.-]')
# Any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_text(text):
    """Return *text* stripped of stray symbols and with whitespace collapsed.

    Steps, in order:

    1. Normalise to Unicode NFC. ``\\w`` does not match combining marks, so
       decomposed text (base letter + combining diacritics, common in PDF
       extraction) would otherwise lose its accents in step 2.
    2. Remove every character that is not a word character, whitespace,
       comma, period or hyphen.
    3. Collapse each whitespace run (including newlines) to a single space
       and trim both ends.

    The result never contains a newline. The former newline-rewriting
    ``replace`` chain ran after step 3 and therefore could never match; it
    has been removed as dead code.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, single-line string (possibly empty).
    """
    text = unicodedata.normalize('NFC', text)
    text = _DISALLOWED_CHARS.sub('', text)
    return _WHITESPACE_RUN.sub(' ', text).strip()
def create_db_from_files(device='cuda'):
    """Build a FAISS index from the PDFs in ``pdf_data_path`` and save it.

    Every ``*.pdf`` directly matched in ``pdf_data_path`` is loaded with
    ``PyPDFLoader``, split with ``AI21SemanticTextSplitter``, cleaned with
    ``clean_text``, embedded with the ``model_name`` Hugging Face model and
    stored in a FAISS index that is written to ``vector_db_path``.

    Args:
        device: Device string handed to the embedding model via
            ``model_kwargs``. Defaults to ``'cuda'`` (the previous hard-coded
            value); pass ``'cpu'`` on hosts without a GPU.

    Returns:
        The FAISS vector store that was saved.

    Raises:
        ValueError: If no non-empty chunk remains after cleaning (for example
            when the directory holds no PDFs), instead of letting index
            construction fail on an empty list.
    """
    loader = DirectoryLoader(pdf_data_path, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()

    text_splitter = AI21SemanticTextSplitter(chunk_size=1024, chunk_overlap=128)
    chunks = text_splitter.split_documents(documents)

    # Clean each chunk in place and drop those that end up empty (e.g. chunks
    # made only of punctuation), so blank strings are not embedded and indexed.
    cleaned_chunks = []
    for chunk in chunks:
        chunk.page_content = clean_text(chunk.page_content)
        if chunk.page_content:
            cleaned_chunks.append(chunk)

    if not cleaned_chunks:
        raise ValueError(
            f"No non-empty text chunks were produced from the PDFs in {pdf_data_path!r}"
        )

    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': False},
    )
    db = FAISS.from_documents(cleaned_chunks, embeddings)
    db.save_local(vector_db_path)
    return db
# Build and persist the vector store.
# NOTE(review): this call is unguarded, so it also runs whenever this module is
# imported; consider an `if __name__ == "__main__":` guard if nothing relies on
# the import-time side effect.
create_db_from_files()