import pandas as pd
import fitz  # PyMuPDF, used for PDF text extraction
import spacy
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModel
import torch
import gradio as gr
import numpy as np
from typing import List
from faiss import IndexFlatL2, normalize_L2
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.schema import BaseRetriever, Document


# Extract raw text from every page of the PDF.
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            text += page.get_text()
    return text


pdf_path = 'Getting_Started_with_Ubuntu_16.04.pdf'  # PDF file in the same directory
pdf_text = extract_text_from_pdf(pdf_path)


# Split the manual into overlapping character chunks so each passage fits the
# embedding model's 512-token window; embedding the whole book as a single row
# would truncate almost all of it and make retrieval pointless.
def chunk_text(text, chunk_size=1000, overlap=200):
    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]


df = pd.DataFrame({'text': chunk_text(pdf_text)})


# Custom embedding model: mean-pool the transformer's last hidden state.
class CustomEmbeddingModel:
    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def embed_text(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True,
                                truncation=True, max_length=512)
        with torch.no_grad():
            embeddings = self.model(**inputs).last_hidden_state.mean(dim=1)
        return embeddings[0].numpy()


embedding_model = CustomEmbeddingModel('FridayMaster/fine_tune_embedding')  # Replace with your model name

# Load the spaCy model and NLTK stopword list used for preprocessing.
nlp = spacy.load("en_core_web_sm")
nltk.download('stopwords', quiet=True)  # no-op if the corpus is already present
stop_words = set(stopwords.words('english'))  # build the set once, not once per token


def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc
              if token.text.lower() not in stop_words and token.is_alpha]
    return ' '.join(tokens)


# Embed the preprocessed text, but keep the raw chunks: the LLM should see
# readable passages, not lemmatized, stopword-free token soup.
df['clean_text'] = df['text'].apply(preprocess_text)
df['text_embeddings'] = df['clean_text'].apply(embedding_model.embed_text)


# Minimal FAISS vector store over L2-normalized embeddings (on unit vectors,
# L2 distance is monotone in cosine similarity).
class SimpleFAISSIndex:
    def __init__(self, embeddings):
        embeddings = np.ascontiguousarray(embeddings, dtype='float32')  # FAISS requires float32
        self.index = IndexFlatL2(embeddings.shape[1])
        normalize_L2(embeddings)  # normalizes in place
        self.index.add(embeddings)

    def search(self, query_embedding, k=1):
        query_embedding = np.ascontiguousarray(query_embedding, dtype='float32')
        normalize_L2(query_embedding)
        distances, indices = self.index.search(query_embedding, k)
        return indices[0], distances[0]


embeddings = np.array(df['text_embeddings'].tolist())
vector_store = SimpleFAISSIndex(embeddings)

# gpt-3.5-turbo is a chat model, so it needs the ChatOpenAI wrapper rather than
# the completion-style OpenAI class; reads OPENAI_API_KEY from the environment.
llm_model = ChatOpenAI(model_name='gpt-3.5-turbo')  # You can replace this with a different LLM if desired


# ConversationalRetrievalChain expects a LangChain BaseRetriever, not an
# arbitrary object, so the custom retriever subclasses it and returns Documents.
class SimpleRetriever(BaseRetriever):
    vector_store: SimpleFAISSIndex
    documents: List[str]

    class Config:
        arbitrary_types_allowed = True  # SimpleFAISSIndex is not a pydantic type

    # Hook called by BaseRetriever.get_relevant_documents in recent classic
    # LangChain releases; older versions overrode get_relevant_documents itself.
    def _get_relevant_documents(self, query, *, run_manager=None):
        # Preprocess the query the same way the indexed chunks were.
        query_embedding = embedding_model.embed_text(preprocess_text(query)).reshape(1, -1)
        indices, _ = self.vector_store.search(query_embedding)
        return [Document(page_content=self.documents[idx]) for idx in indices]


retriever = SimpleRetriever(vector_store=vector_store, documents=df['text'].tolist())
chain = ConversationalRetrievalChain.from_llm(llm_model, retriever=retriever)


def generate_response(prompt):
    # The chain expects a question plus chat history and returns its reply
    # under "answer" ("query"/"result" belong to RetrievalQA, not this chain).
    result = chain({"question": prompt, "chat_history": []})
    return result["answer"]


# Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Enter your query", placeholder="Ask about Ubuntu..."),
    outputs=gr.Textbox(label="Response"),
    title="Ubuntu Manual Chatbot",
    description="Ask questions about the Ubuntu manual."
)

if __name__ == "__main__":
    iface.launch()
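
# Usage note (a sketch: the package names below are the usual PyPI
# distributions for the imports above, untested against pinned versions;
# "app.py" is a placeholder for whatever this script is saved as):
#
#   pip install pandas pymupdf spacy nltk transformers torch gradio numpy \
#       faiss-cpu langchain openai
#   python -m spacy download en_core_web_sm
#   export OPENAI_API_KEY=...   # ChatOpenAI reads the key from the environment
#   python app.py               # then open the local URL that Gradio prints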