import os
import logging

import torch
from tqdm import tqdm
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM

# Configure logging
logging.basicConfig(level=logging.INFO)

# Set up the OpenAI API key
load_dotenv(dotenv_path='keys.env')
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise EnvironmentError("OPENAI_API_KEY not found in keys.env")
os.environ['OPENAI_API_KEY'] = openai_api_key


class LocalEmbeddings:
    """Embeds documents with a local causal LM by mean-pooling the last hidden layer."""

    def __init__(self, model_dir):
        self.model = AutoModelForCausalLM.from_pretrained(model_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
        # LLaMA-style tokenizers ship without a pad token; add one and resize
        # the model's embedding matrix so the new token gets a row.
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.model.resize_token_embeddings(len(self.tokenizer))

    def embed_documents(self, texts):
        embeddings = []
        for text in texts:
            inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
            with torch.no_grad():
                # Request hidden states so they can be pooled into a document vector
                outputs = self.model(**inputs, output_hidden_states=True)
            last_layer_hidden_states = outputs.hidden_states[-1]
            # Mean-pool over the token dimension: (1, seq_len, hidden) -> (hidden,)
            document_embedding = last_layer_hidden_states.mean(dim=1).squeeze(0)
            embeddings.append(document_embedding.numpy().tolist())
        return embeddings


class CustomDirectoryLoader:
    """Loads documents from per-format subfolders and splits them into chunks."""

    def __init__(self, directory_path, **kwargs):
        self.directory_path = directory_path
        self.kwargs = kwargs
        self.documents = []

    def load(self):
        # Map each file extension to the subfolder holding that file type.
        file_types = {
            'pdf': 'PDFFiles',
            'xlsx': 'ExcelFiles',
            'txt': 'TXTFiles',
            'pptx': 'PowerPointFiles',
            'csv': 'CSVFiles',
            'docx': 'WordFiles',
        }
        loaders = []
        for ext, folder in file_types.items():
            path = os.path.join(self.directory_path, folder)
            glob_pattern = f"**/*.{ext}"
            try:
                loader = DirectoryLoader(path, glob=glob_pattern, show_progress=True,
                                         max_concurrency=24, use_multithreading=True,
                                         **self.kwargs)
                loaders.append(loader)
            except Exception as e:
                logging.error(f"Failed to create loader for {folder}: {e}")

        for loader in tqdm(loaders, desc="Loading documents"):
            try:
                self.documents.extend(loader.load())
                logging.info(f"Successfully loaded documents from {loader}")
            except Exception as e:
                logging.error(f"Failed to load documents: {e}")

    def split_by_chunks(self):
        self.load()
        text_splitter = CharacterTextSplitter(chunk_size=5000, chunk_overlap=0)
        return text_splitter.split_documents(self.documents)


def initialize_embeddings():
    return OpenAIEmbeddings()


def load_and_split_documents(directory_path):
    directory_loader = CustomDirectoryLoader(directory_path)
    return directory_loader.split_by_chunks()


def initialize_vectorstore(documents, embeddings):
    return Chroma.from_documents(documents, embeddings, persist_directory="./chroma_db3")


def interactive_chat(chain):
    chat_history = []
    while True:
        query = input("Your question: ")
        if query.lower() in ['quit', 'exit', 'q']:
            print("Exiting. Goodbye!")
            break
        result = chain({"question": query, "chat_history": chat_history})
        print("Answer:", result['answer'])
        chat_history.append((query, result['answer']))


def main():
    directory_path = "../data/"
    llama_model_path = '../embeddings/llama7B/'  # Only needed if swapping in LocalEmbeddings.

    embeddings = initialize_embeddings()
    loaded_documents = load_and_split_documents(directory_path)
    vectorstore = initialize_vectorstore(loaded_documents, embeddings)
    # MMR retrieval: fetch 10 candidates, then trade relevance against
    # diversity (lambda_mult 0.3 favors diversity).
    retriever = vectorstore.as_retriever(search_type="mmr",
                                         search_kwargs={"fetch_k": 10, "lambda_mult": 0.3})
    chain = ConversationalRetrievalChain.from_llm(ChatOpenAI(model_name="gpt-3.5-turbo-16k"),
                                                  retriever)
    interactive_chat(chain)


if __name__ == "__main__":
    main()