Spaces:

miki5799
/

nlp4web

Sleeping

App Files Files Community

miki5799 committed on Nov 7, 2024

Commit

d8d5586

1 Parent(s): 3c1805a

Refactor app.py for improved readability and organization; rearranged imports, added spacing, and formatted code blocks.

Browse files

Files changed (1) hide show

app.py +78 -28

app.py CHANGED Viewed

@@ -1,12 +1,15 @@
-from dataclasses import dataclass
-import pickle
 import os
-from typing import Iterable, Callable, List, Dict, Optional, Type, TypeVar
-from nlp4web_codebase.ir.data_loaders.dm import Document
-from collections import Counter
-import tqdm
 import re
 import nltk
 nltk.download("stopwords", quiet=True)
 from nltk.corpus import stopwords as nltk_stopwords
@@ -18,22 +21,30 @@ stopwords = set(nltk_stopwords.words(LANGUAGE))
 def word_splitting(text: str) -> List[str]:
     """Lowercase ``text`` and split it with the module-level ``word_splitter``."""
     lowered = text.lower()
     return word_splitter(lowered)
 def lemmatization(words: List[str]) -> List[str]:
     """Placeholder lemmatizer: hand back ``words`` exactly as received."""
     # We ignore lemmatization here for simplicity
     return words
 def simple_tokenize(text: str) -> List[str]:
     """Tokenize ``text``: split via ``word_splitting``, drop stopwords, lemmatize."""
     # Keep only the words that are not in the module-level ``stopwords`` set.
     kept = [word for word in word_splitting(text) if word not in stopwords]
     return lemmatization(kept)
 T = TypeVar("T", bound="InvertedIndex")
 @dataclass
 class PostingList:
     term: str  # The term
-    docid_postings: List[int]  # docid_postings[i] means the docid (int) of the i-th associated posting
-    tweight_postings: List[float]  # tweight_postings[i] means the term weight (float) of the i-th associated posting
 @dataclass
@@ -72,6 +83,7 @@ class Counting:
     nterms: int
     doc_texts: Optional[List[str]] = None
 def run_counting(
     documents: Iterable[Document],
     tokenize_fn: Callable[[str], List[str]] = simple_tokenize,
@@ -131,22 +143,23 @@ def run_counting(
         doc_texts=doc_texts,
     )
 from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
 sciq = load_sciq()
 counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
 from __future__ import annotations
-from dataclasses import asdict, dataclass
 import math
-import os
 from typing import Iterable, List, Optional, Type
-import tqdm
 from nlp4web_codebase.ir.data_loaders.dm import Document
 @dataclass
 class BM25Index(InvertedIndex):
     @staticmethod
     def tokenize(text: str) -> List[str]:
         return simple_tokenize(text)
@@ -230,6 +243,7 @@ class BM25Index(InvertedIndex):
         )
         return index
 bm25_index = BM25Index.build_from_documents(
     documents=iter(sciq.corpus),
     ndocs=12160,
@@ -237,13 +251,13 @@ bm25_index = BM25Index.build_from_documents(
 )
 bm25_index.save("output/bm25_index")
-from nlp4web_codebase.ir.models import BaseRetriever
-from typing import Type
 from abc import abstractmethod
-class BaseInvertedIndexRetriever(BaseRetriever):
     @property
     @abstractmethod
     def index_class(self) -> Type[InvertedIndex]:
@@ -295,16 +309,48 @@ class BaseInvertedIndexRetriever(BaseRetriever):
 class BM25Retriever(BaseInvertedIndexRetriever):
     """Inverted-index retriever whose index class is ``BM25Index``."""
     @property
     def index_class(self) -> Type[BM25Index]:
         """Return ``BM25Index`` as this retriever's index class."""
         index_type = BM25Index
         return index_type
 bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
-bm25_retriever.retrieve("What type of diseases occur when the immune system attacks normal body cells?")
-plots_b = {'X': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 'Y': [0.694980045351474, 0.8126195011337869, 0.821528798185941, 0.8218562358276644, 0.8222244897959182, 0.8195024943310657, 0.8182163265306123, 0.8174734693877551, 0.8139020408163266, 0.8116893424036281, 0.8083002267573697]} #TODO: Replace
-plots_k1 = {'X': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 'Y': [0.7345419501133786, 0.7668607709750567, 0.779508843537415, 0.7900947845804988, 0.8015931972789115, 0.8103560090702948, 0.812374149659864, 0.8156743764172336, 0.8194036281179138, 0.8222244897959182, 0.8221800453514739]}
 best_b = plots_b["X"][np.argmax(plots_b["Y"])]
 best_k1 = plots_k1["X"][np.argmax(plots_k1["Y"])]
@@ -313,23 +359,26 @@ bm25_index = BM25Index.build_from_documents(
     ndocs=12160,
     show_progress_bar=True,
     k1=best_k1,
-    b=best_b
 )
-import gradio as gr
 from typing import TypedDict
-import pandas as pd
 class Hit(TypedDict):
-  cid: str
-  score: float
-  text: str
 demo: Optional[gr.Interface] = None  # Assign your gradio demo to this variable
 return_type = List[Hit]
 ## YOUR_CODE_STARTS_HERE
-def retrieve(query: str, topk: int=10) -> return_type:
     ranking = bm25_retriever.retrieve(query=query, topk=3)
     hits = []
     for cid, score in ranking.items():
@@ -337,6 +386,7 @@ def retrieve(query: str, topk: int=10) -> return_type:
         hits.append({"cid": cid, "score": score, "text": text})
     return hits
 demo = gr.Interface(
     fn=retrieve,
     inputs=gr.Textbox(lines=3, placeholder="Enter your query here..."),
@@ -347,7 +397,7 @@ demo = gr.Interface(
         ["What are the differences between immunodeficiency and autoimmune diseases?"],
         ["What are the causes of immunodeficiency?"],
         ["What are the symptoms of immunodeficiency?"],
-    ]
 )
 ## YOUR_CODE_ENDS_HERE
-demo.launch()

 import os
+import pickle
 import re
+from collections import Counter
+from dataclasses import dataclass
+from typing import Callable, Dict, Iterable, List, Optional, Type, TypeVar
 import nltk
+import tqdm
+from nlp4web_codebase.ir.data_loaders.dm import Document
 nltk.download("stopwords", quiet=True)
 from nltk.corpus import stopwords as nltk_stopwords
 def word_splitting(text: str) -> List[str]:
     """Lowercase ``text`` and split it with the module-level ``word_splitter``."""
     lowered = text.lower()
     return word_splitter(lowered)
 def lemmatization(words: List[str]) -> List[str]:
     """Placeholder lemmatizer: hand back ``words`` exactly as received."""
     # We ignore lemmatization here for simplicity
     return words
 def simple_tokenize(text: str) -> List[str]:
     """Tokenize ``text``: split via ``word_splitting``, drop stopwords, lemmatize."""
     # Keep only the words that are not in the module-level ``stopwords`` set.
     kept = [word for word in word_splitting(text) if word not in stopwords]
     return lemmatization(kept)
 T = TypeVar("T", bound="InvertedIndex")
 @dataclass
 class PostingList:
     term: str  # The term
+    docid_postings: List[
+        int
+    ]  # docid_postings[i] means the docid (int) of the i-th associated posting
+    tweight_postings: List[
+        float
+    ]  # tweight_postings[i] means the term weight (float) of the i-th associated posting
 @dataclass
     nterms: int
     doc_texts: Optional[List[str]] = None
 def run_counting(
     documents: Iterable[Document],
     tokenize_fn: Callable[[str], List[str]] = simple_tokenize,
         doc_texts=doc_texts,
     )
 from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
 sciq = load_sciq()
 counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
 from __future__ import annotations
 import math
+from dataclasses import dataclass
 from typing import Iterable, List, Optional, Type
 from nlp4web_codebase.ir.data_loaders.dm import Document
 @dataclass
 class BM25Index(InvertedIndex):
     @staticmethod
     def tokenize(text: str) -> List[str]:
         return simple_tokenize(text)
         )
         return index
 bm25_index = BM25Index.build_from_documents(
     documents=iter(sciq.corpus),
     ndocs=12160,
 )
 bm25_index.save("output/bm25_index")
 from abc import abstractmethod
+from typing import Type
+from nlp4web_codebase.ir.models import BaseRetriever
+class BaseInvertedIndexRetriever(BaseRetriever):
     @property
     @abstractmethod
     def index_class(self) -> Type[InvertedIndex]:
 class BM25Retriever(BaseInvertedIndexRetriever):
     """Inverted-index retriever whose index class is ``BM25Index``."""
     @property
     def index_class(self) -> Type[BM25Index]:
         """Return ``BM25Index`` as this retriever's index class."""
         index_type = BM25Index
         return index_type
 bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
+bm25_retriever.retrieve(
+    "What type of diseases occur when the immune system attacks normal body cells?"
+)
+plots_b = {
+    "X": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
+    "Y": [
+        0.694980045351474,
+        0.8126195011337869,
+        0.821528798185941,
+        0.8218562358276644,
+        0.8222244897959182,
+        0.8195024943310657,
+        0.8182163265306123,
+        0.8174734693877551,
+        0.8139020408163266,
+        0.8116893424036281,
+        0.8083002267573697,
+    ],
+}  # TODO: Replace
+plots_k1 = {
+    "X": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
+    "Y": [
+        0.7345419501133786,
+        0.7668607709750567,
+        0.779508843537415,
+        0.7900947845804988,
+        0.8015931972789115,
+        0.8103560090702948,
+        0.812374149659864,
+        0.8156743764172336,
+        0.8194036281179138,
+        0.8222244897959182,
+        0.8221800453514739,
+    ],
+}
 best_b = plots_b["X"][np.argmax(plots_b["Y"])]
 best_k1 = plots_k1["X"][np.argmax(plots_k1["Y"])]
     ndocs=12160,
     show_progress_bar=True,
     k1=best_k1,
+    b=best_b,
 )
 from typing import TypedDict
+import gradio as gr
 class Hit(TypedDict):
+    cid: str
+    score: float
+    text: str
 demo: Optional[gr.Interface] = None  # Assign your gradio demo to this variable
 return_type = List[Hit]
 ## YOUR_CODE_STARTS_HERE
+def retrieve(query: str, topk: int = 10) -> return_type:
     ranking = bm25_retriever.retrieve(query=query, topk=3)
     hits = []
     for cid, score in ranking.items():
         hits.append({"cid": cid, "score": score, "text": text})
     return hits
 demo = gr.Interface(
     fn=retrieve,
     inputs=gr.Textbox(lines=3, placeholder="Enter your query here..."),
         ["What are the differences between immunodeficiency and autoimmune diseases?"],
         ["What are the causes of immunodeficiency?"],
         ["What are the symptoms of immunodeficiency?"],
+    ],
 )
 ## YOUR_CODE_ENDS_HERE
+demo.launch()