Siddhant committed

Commit bd8cd5c · unverified · 1 parent: e597784

initial commit

Files changed (9)
  1. .dockerignore +1 -0
  2. .gitattributes +0 -35
  3. .gitignore +129 -0
  4. Dockerfile +14 -0
  5. README.md +2 -10
  6. main.py +36 -0
  7. requirements.txt +7 -0
  8. utils/GetDB.py +38 -0
  9. utils/GetTopAndRecentQuestions.py +137 -0
.dockerignore ADDED
@@ -0,0 +1 @@
+ venv/
.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,129 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
Dockerfile ADDED
@@ -0,0 +1,14 @@
+ FROM python:3.10
+
+ WORKDIR /app
+
+ COPY . /app
+ #RUN mkdir /app/data
+ RUN pip install -r requirements.txt
+
+ EXPOSE 80
+
+ #CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--ssl-keyfile", "./privkey.pem", "--ssl-certfile", "./fullchain.pem"]
+
+ #CMD ["uvicorn", "main:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "80", "--workers", "5"]
+ CMD ["uvicorn", "main:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,2 @@
- ---
- title: Top Questions
- emoji: 🔥
- colorFrom: yellow
- colorTo: blue
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # twimbit-answer
+ A ChatGPT-powered bot that answers queries about post content and formatted content
main.py ADDED
@@ -0,0 +1,36 @@
+ from fastapi import FastAPI, HTTPException, Header, status, Body, Request
+ from typing import Annotated
+ import os
+ from fastapi.middleware.cors import CORSMiddleware
+ from dotenv import load_dotenv
+ from utils.GetTopAndRecentQuestions import return_top_question
+
+ load_dotenv()
+
+ app = FastAPI(docs_url="/documentation", redoc_url=None)
+
+ origins = [
+     "http://localhost:4200",
+     "https://releasepreview.twimbit.com"
+ ]
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=['*'],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+ token = os.getenv("token")
+
+
+ @app.get("/get_top_questions", status_code=status.HTTP_200_OK)
+ def get_top_questions_(limit: int = 5, Authorization: Annotated[list[str] | None, Header()] = None):
+     if Authorization is None or Authorization[0] != "Bearer {}".format(token):
+         raise HTTPException(status_code=401, detail="Unauthorised.")
+     try:
+         # return {'data': return_top_question(limit)}
+         return {'data': 'success'}
+     except Exception as e:
+         # report the error to the client
+         raise HTTPException(status_code=400, detail=str(e))
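
Note: a minimal client sketch for the endpoint above, assuming the service is reachable on port 7860 and that the placeholder <token> matches the server's token environment variable (URL, port, and token are illustrative, not taken from the repo):

# Hypothetical client call; URL, port, and token are placeholders.
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:7860/get_top_questions?limit=5",
    headers={"Authorization": "Bearer <token>"},  # must match os.getenv("token") on the server
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))  # currently {'data': 'success'}, since the real call is commented out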
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ fastapi==0.95.1
+ psycopg2==2.9.6
+ python-dotenv==1.0.0
+ uvicorn==0.22.0
+ tiktoken==0.3.3
+ weaviate-client==3.19.2
+ sentence_transformers
utils/GetDB.py ADDED
@@ -0,0 +1,38 @@
+ import psycopg2.extras
+ import os
+ import psycopg2
+ from psycopg2 import pool
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+
+ class GetDB:
+     def __init__(self):
+         self.host = os.getenv("host")
+         self.database = os.getenv("database")
+         self.user = os.getenv("user")
+         self.password = os.getenv("password")
+         self.port = os.getenv("port")
+
+     def get_db_connection(self):
+
+         try:
+             postgreSQL_pool = psycopg2.pool.SimpleConnectionPool(1, 20, host=self.host,
+                                                                  database=self.database,
+                                                                  user=self.user,
+                                                                  password=self.password,
+                                                                  port=self.port)
+
+             # Use getconn() to get a connection from the pool
+             return postgreSQL_pool
+
+         except (Exception, psycopg2.DatabaseError) as error:
+             print("Error while connecting to PostgreSQL", error)
+     #
+     # finally:
+     #     # closing database connection.
+     #     # use closeall() to close all active connections if you want to turn off the application
+     #     if postgreSQL_pool:
+     #         postgreSQL_pool.closeall()
+     #         print("PostgreSQL connection pool is closed")
utils/GetTopAndRecentQuestions.py ADDED
@@ -0,0 +1,137 @@
+ from sentence_transformers import SentenceTransformer, util
+ import torch
+ import difflib
+ from utils.GetDB import GetDB
+
+ postgreSQL_pool = GetDB().get_db_connection()
+
+ embedder = SentenceTransformer('all-MiniLM-L6-v2')
+
+
+ def get_question():
+     # Connect to the PostgreSQL database
+     conn = postgreSQL_pool.getconn()
+
+     # Create a cursor object
+     cur = conn.cursor()
+
+     # Execute a SELECT query to fetch questions from the "chat_history" table
+     cur.execute("SELECT question FROM chat_history ORDER BY created_at DESC")
+
+     # Fetch all the results as a list of tuples
+     results = cur.fetchall()
+     results = [x[0] for x in results]
+
+     # Return the connection to the pool
+     # # cur.close()
+     postgreSQL_pool.putconn(conn)
+     return results
+
+
+ def count_top_questions(questions_array):
+     corpus_embeddings = embedder.encode(questions_array, convert_to_tensor=True)
+
+     top_questions_array = {}
+
+     for question in questions_array:
+
+         query_embedding = embedder.encode([question], convert_to_tensor=True)
+
+         # Use cosine similarity and torch.topk to find the most similar questions
+         cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
+         top_results = torch.topk(cos_scores, k=100)
+
+         counter = 0
+
+         for score, idx in zip(top_results[0][1:], top_results[1][1:]):
+             if score.item() >= 0.8:
+                 counter += 1
+
+         top_questions_array[question] = counter
+
+     # sort by count and keep the 50 most-asked questions
+     return sorted(top_questions_array.items(), key=lambda x: x[1], reverse=True)[:50]
+
+
+ def remove_redundancy(redundant_raw_top_asked_questions):
+     for raw_top_asked_question in redundant_raw_top_asked_questions:
+
+         for raw_top_asked_question_inner in redundant_raw_top_asked_questions:
+             matching_ratio = difflib.SequenceMatcher(None, raw_top_asked_question_inner[0],
+                                                      raw_top_asked_question[0]).ratio()
+
+             if 0.7 <= matching_ratio < 1.0:
+                 redundant_raw_top_asked_questions.remove(raw_top_asked_question_inner)
+
+     return redundant_raw_top_asked_questions
+
+
+ def remove_greetings(sanitised_questions_array):
+     greeting_array = ['hey', 'hi', 'hello', "Hello!",
+                       "Hi there!",
+                       "Hey!",
+                       "Good morning!",
+                       "Good afternoon!",
+                       "Good evening!",
+                       "Howdy!",
+                       "Greetings!",
+                       "Nice to see you!",
+                       "What's up?",
+                       "Hi!",
+                       "hiiii",
+                       "Hello!",
+                       "Hey!", "How are you?",
+                       "What is your name?",
+                       "Where are you from?",
+                       "What do you do?",
+                       "How can I help you?",
+                       "What's the weather like?",
+                       "Do you have any plans for the weekend?",
+                       "Have you seen any good movies lately?",
+                       "What's your favorite food?",
+                       "What are your hobbies?", "hi, hello"]
+
+     greetings_embeddings = embedder.encode(greeting_array, convert_to_tensor=True)
+
+     for raw_top_asked_question in sanitised_questions_array[:10]:
+         query_embedding = embedder.encode([raw_top_asked_question[0]], convert_to_tensor=True)
+
+         cos_scores = util.cos_sim(query_embedding, greetings_embeddings)[0]
+         top_results = torch.topk(cos_scores, k=1)
+
+         for score, idx in zip(top_results[0], top_results[1]):
+             if score.item() >= 0.87:
+                 sanitised_questions_array.remove(raw_top_asked_question)
+
+     return sanitised_questions_array
+
+
+ def final_phase_filtering(raw_first_phase_filtered_questions, limit=20):
+     raw_first_phase_filtered_questions = raw_first_phase_filtered_questions[:limit]
+     for raw_first_phase_filtered_question in raw_first_phase_filtered_questions:
+         for raw_first_phase_filtered_question_inner in raw_first_phase_filtered_questions:
+             emb1 = embedder.encode(raw_first_phase_filtered_question[0])
+             emb2 = embedder.encode(raw_first_phase_filtered_question_inner[0])
+
+             cos_sim = util.cos_sim(emb1, emb2)
+
+             if 0.85 <= cos_sim.item() < 1.0000001192092896:
+                 raw_first_phase_filtered_questions.remove(raw_first_phase_filtered_question_inner)
+
+     return raw_first_phase_filtered_questions
+
+
+ def return_top_question(limit=5):
+     questions = get_question()
+     count_top_questions_ = count_top_questions(questions)
+     remove_redundancy_ = remove_redundancy(count_top_questions_)
+     remove_greetings_ = remove_greetings(remove_redundancy_)
+     final_phase_filtering_ = final_phase_filtering(remove_greetings_)[:limit]
+
+     message = 'Top questions asked on ask twimbit or on the platform by users:'
+     for key, final_phase_filtering__ in enumerate(final_phase_filtering_):
+         message = message + '\n {}: '.format(key + 1) + final_phase_filtering__[0]
+
+     return message
+
+ # print(return_top_question())
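
Note: a small self-contained sketch of the cosine-similarity counting idea used in count_top_questions above, run on made-up toy questions (the model name matches the module; everything else is illustrative):

# Toy illustration of counting near-duplicate questions with sentence embeddings.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')
questions = ["What is fintech?", "what is fintech", "How do neobanks make money?"]

emb = model.encode(questions, convert_to_tensor=True)
scores = util.cos_sim(emb, emb)  # pairwise cosine similarities

for i, q in enumerate(questions):
    # count other questions whose similarity to this one is at least 0.8
    near_duplicates = sum(1 for j in range(len(questions)) if j != i and scores[i][j].item() >= 0.8)
    print(q, near_duplicates)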