Spaces:

SiddharthAK
/

TextLSRDemo

Running

App Files Files Community

SiddharthAK committed 21 days ago

Commit

44519b1

verified ·

1 Parent(s): c3ddf27

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -16

app.py CHANGED Viewed

@@ -53,17 +53,19 @@ document_texts = {}           # Stores {doc_id: doc_text}
 initial_doc_model_for_indexing = "SPLADE-cocondenser-distil" # Fixed for initial demo index
-# --- Load SciFact Corpus using ir_datasets ---
-def load_scifact_corpus_ir_datasets():
     global document_texts
-    print("Loading SciFact corpus using ir_datasets...")
     try:
-        dataset = ir_datasets.load("scifact")
-        for doc in tqdm(dataset.docs_iter(), desc="Loading SciFact documents"):
             document_texts[doc.doc_id] = doc.text.strip()
-        print(f"Loaded {len(document_texts)} documents from SciFact corpus.")
     except Exception as e:
-        print(f"Error loading SciFact corpus with ir_datasets: {e}")
         print("Please ensure 'ir_datasets' is installed and your internet connection is stable.")
@@ -88,8 +90,6 @@ def create_lexical_bow_mask(input_ids, vocab_size, tokenizer):
 # --- Core Representation Functions (Return Formatted Strings - for Explorer Tab) ---
-# These are your original functions, re-added.
 def get_splade_cocondenser_representation(text):
     if tokenizer_splade is None or model_splade is None:
         return "SPLADE-cocondenser-distil model is not loaded. Please check the console for loading errors."
@@ -254,8 +254,6 @@ def predict_representation_explorer(model_choice, text):
 # --- Internal Core Representation Functions (Return Raw Vectors - for Retrieval Tab) ---
-# These are the ones ending with _internal, as previously defined.
 def get_splade_cocondenser_representation_internal(text, tokenizer, model):
     if tokenizer is None or model is None: return None
     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
@@ -400,7 +398,8 @@ def predict_retrieval_gradio(query_text, query_model_choice, selected_doc_model_
 # --- Initial Load and Indexing Calls ---
 # This part runs once when the app starts.
-load_scifact_corpus_ir_datasets() # Or load_cranfield_corpus_ir_datasets() if you switch back
 if initial_doc_model_for_indexing == "SPLADE-cocondenser-distil" and model_splade is not None:
     index_documents(initial_doc_model_for_indexing)
@@ -440,13 +439,11 @@ with gr.Blocks(title="SPLADE Demos") as demo:
                 ],
                 outputs=gr.Markdown(),
                 allow_flagging="never",
-                # Don't show redundant title/description within the tab, as it's above
-                # Setting live=True might be slow for complex models on every keystroke
-                # live=True
             )
         with gr.TabItem("Document Retrieval Demo"):
-            gr.Markdown("### Retrieve Documents from SciFact Collection")
             gr.Interface(
                 fn=predict_retrieval_gradio,
                 inputs=[

 initial_doc_model_for_indexing = "SPLADE-cocondenser-distil" # Fixed for initial demo index
+# --- Load Cranfield Corpus using ir_datasets ---
+# Function renamed to match the Cranfield dataset; the call site below uses the new name
+def load_cranfield_corpus_ir_datasets():
     global document_texts
+    print("Loading Cranfield corpus using ir_datasets...")
     try:
+        # --- IMPORTANT CHANGE: Loading 'cranfield' dataset ---
+        dataset = ir_datasets.load("cranfield")
+        for doc in tqdm(dataset.docs_iter(), desc="Loading Cranfield documents"):
             document_texts[doc.doc_id] = doc.text.strip()
+        print(f"Loaded {len(document_texts)} documents from Cranfield corpus.")
     except Exception as e:
+        print(f"Error loading Cranfield corpus with ir_datasets: {e}")
         print("Please ensure 'ir_datasets' is installed and your internet connection is stable.")
 # --- Core Representation Functions (Return Formatted Strings - for Explorer Tab) ---
 def get_splade_cocondenser_representation(text):
     if tokenizer_splade is None or model_splade is None:
         return "SPLADE-cocondenser-distil model is not loaded. Please check the console for loading errors."
 # --- Internal Core Representation Functions (Return Raw Vectors - for Retrieval Tab) ---
 def get_splade_cocondenser_representation_internal(text, tokenizer, model):
     if tokenizer is None or model is None: return None
     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
 # --- Initial Load and Indexing Calls ---
 # This part runs once when the app starts.
+# --- IMPORTANT CHANGE: Calling the function that loads Cranfield ---
+load_cranfield_corpus_ir_datasets()
 if initial_doc_model_for_indexing == "SPLADE-cocondenser-distil" and model_splade is not None:
     index_documents(initial_doc_model_for_indexing)
                 ],
                 outputs=gr.Markdown(),
                 allow_flagging="never",
+                # live=True # Setting live=True might be slow for complex models on every keystroke
             )
         with gr.TabItem("Document Retrieval Demo"):
+            gr.Markdown("### Retrieve Documents from Cranfield Collection") # Changed title
             gr.Interface(
                 fn=predict_retrieval_gradio,
                 inputs=[