Siddhant committed

Commit bd8cd5c · unverified · 1 parent: e597784

initial commit

Files changed (9)
  1. .dockerignore +1 -0
  2. .gitattributes +0 -35
  3. .gitignore +129 -0
  4. Dockerfile +14 -0
  5. README.md +2 -10
  6. main.py +36 -0
  7. requirements.txt +7 -0
  8. utils/GetDB.py +38 -0
  9. utils/GetTopAndRecentQuestions.py +137 -0
.dockerignore ADDED
@@ -0,0 +1 @@
+ venv/
.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,129 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
Dockerfile ADDED
@@ -0,0 +1,14 @@
+ FROM python:3.10
+
+ WORKDIR /app
+
+ COPY . /app
+ #RUN mkdir /app/data
+ RUN pip install -r requirements.txt
+
+ EXPOSE 80
+
+ #CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--ssl-keyfile", "./privkey.pem", "--ssl-certfile", "./fullchain.pem"]
+
+ #CMD ["uvicorn", "main:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "80", "--workers", "5"]
+ CMD ["uvicorn", "main:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,2 @@
- ---
- title: Top Questions
- emoji: 🔥
- colorFrom: yellow
- colorTo: blue
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # twimbit-answer
+ A ChatGPT-powered bot that answers queries about post content and formatted content
main.py ADDED
@@ -0,0 +1,36 @@
+ from fastapi import FastAPI, HTTPException, Header, status, Body, Request
+ from typing import Annotated
+ import os
+ from fastapi.middleware.cors import CORSMiddleware
+ from dotenv import load_dotenv
+ from utils.GetTopAndRecentQuestions import return_top_question
+
+ load_dotenv()
+
+ app = FastAPI(docs_url="/documentation", redoc_url=None)
+
+ origins = [
+     "http://localhost:4200",
+     "https://releasepreview.twimbit.com"
+ ]
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=['*'],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+ token = os.getenv("token")
+
+
+ @app.get("/get_top_questions", status_code=status.HTTP_200_OK)
+ def get_top_questions_(limit: int = 5, Authorization: Annotated[list[str] | None, Header()] = None):
+     if Authorization is None or Authorization[0] != "Bearer {}".format(token):
+         raise HTTPException(status_code=401, detail="Unauthorised.")
+     try:
+         # return {'data': return_top_question(limit)}
+         return {'data': 'success'}
+     except Exception as e:
+         # report the error to the client
+         raise HTTPException(status_code=400, detail=str(e))
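
Note: a minimal client sketch for the endpoint above, assuming the service is reachable on port 7860 and that the placeholder <token> matches the server's token environment variable (URL, port, and token are illustrative, not taken from the repo):

# Hypothetical client call; URL, port, and token are placeholders.
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:7860/get_top_questions?limit=5",
    headers={"Authorization": "Bearer <token>"},  # must match os.getenv("token") on the server
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))  # currently {'data': 'success'}, since the real call is commented out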
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ fastapi==0.95.1
+ psycopg2==2.9.6
+ python-dotenv==1.0.0
+ uvicorn==0.22.0
+ tiktoken==0.3.3
+ weaviate-client==3.19.2
+ sentence_transformers
utils/GetDB.py ADDED
@@ -0,0 +1,38 @@
+ import psycopg2.extras
+ import os
+ import psycopg2
+ from psycopg2 import pool
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+
+ class GetDB:
+     def __init__(self):
+         self.host = os.getenv("host")
+         self.database = os.getenv("database")
+         self.user = os.getenv("user")
+         self.password = os.getenv("password")
+         self.port = os.getenv("port")
+
+     def get_db_connection(self):
+
+         try:
+             postgreSQL_pool = psycopg2.pool.SimpleConnectionPool(1, 20, host=self.host,
+                                                                  database=self.database,
+                                                                  user=self.user,
+                                                                  password=self.password,
+                                                                  port=self.port)
+
+             # Use getconn() to get a connection from the pool
+             return postgreSQL_pool
+
+         except (Exception, psycopg2.DatabaseError) as error:
+             print("Error while connecting to PostgreSQL", error)
+     #
+     # finally:
+     #     # closing database connection.
+     #     # use closeall() to close all active connections if you want to turn off the application
+     #     if postgreSQL_pool:
+     #         postgreSQL_pool.closeall()
+     #         print("PostgreSQL connection pool is closed")
utils/GetTopAndRecentQuestions.py ADDED
@@ -0,0 +1,137 @@
+ from sentence_transformers import SentenceTransformer, util
+ import torch
+ import difflib
+ from utils.GetDB import GetDB
+
+ postgreSQL_pool = GetDB().get_db_connection()
+
+ embedder = SentenceTransformer('all-MiniLM-L6-v2')
+
+
+ def get_question():
+     # Connect to the PostgreSQL database
+     conn = postgreSQL_pool.getconn()
+
+     # Create a cursor object
+     cur = conn.cursor()
+
+     # Execute a SELECT query to fetch questions from the "chat_history" table
+     cur.execute("SELECT question FROM chat_history ORDER BY created_at DESC")
+
+     # Fetch all the results as a list of tuples
+     results = cur.fetchall()
+     results = [x[0] for x in results]
+
+     # Return the connection to the pool
+     # # cur.close()
+     postgreSQL_pool.putconn(conn)
+     return results
+
+
+ def count_top_questions(questions_array):
+     corpus_embeddings = embedder.encode(questions_array, convert_to_tensor=True)
+
+     top_questions_array = {}
+
+     for question in questions_array:
+
+         query_embedding = embedder.encode([question], convert_to_tensor=True)
+
+         # Use cosine similarity and torch.topk to find the most similar questions
+         cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
+         top_results = torch.topk(cos_scores, k=100)
+
+         counter = 0
+
+         for score, idx in zip(top_results[0][1:], top_results[1][1:]):
+             if score.item() >= 0.8:
+                 counter += 1
+
+         top_questions_array[question] = counter
+
+     # sort by count and keep the 50 most-asked questions
+     return sorted(top_questions_array.items(), key=lambda x: x[1], reverse=True)[:50]
+
+
+ def remove_redundancy(redundant_raw_top_asked_questions):
+     for raw_top_asked_question in redundant_raw_top_asked_questions:
+
+         for raw_top_asked_question_inner in redundant_raw_top_asked_questions:
+             matching_ratio = difflib.SequenceMatcher(None, raw_top_asked_question_inner[0],
+                                                      raw_top_asked_question[0]).ratio()
+
+             if 0.7 <= matching_ratio < 1.0:
+                 redundant_raw_top_asked_questions.remove(raw_top_asked_question_inner)
+
+     return redundant_raw_top_asked_questions
+
+
+ def remove_greetings(sanitised_questions_array):
+     greeting_array = ['hey', 'hi', 'hello', "Hello!",
+                       "Hi there!",
+                       "Hey!",
+                       "Good morning!",
+                       "Good afternoon!",
+                       "Good evening!",
+                       "Howdy!",
+                       "Greetings!",
+                       "Nice to see you!",
+                       "What's up?",
+                       "Hi!",
+                       "hiiii",
+                       "Hello!",
+                       "Hey!", "How are you?",
+                       "What is your name?",
+                       "Where are you from?",
+                       "What do you do?",
+                       "How can I help you?",
+                       "What's the weather like?",
+                       "Do you have any plans for the weekend?",
+                       "Have you seen any good movies lately?",
+                       "What's your favorite food?",
+                       "What are your hobbies?", "hi, hello"]
+
+     greetings_embeddings = embedder.encode(greeting_array, convert_to_tensor=True)
+
+     for raw_top_asked_question in sanitised_questions_array[:10]:
+         query_embedding = embedder.encode([raw_top_asked_question[0]], convert_to_tensor=True)
+
+         cos_scores = util.cos_sim(query_embedding, greetings_embeddings)[0]
+         top_results = torch.topk(cos_scores, k=1)
+
+         for score, idx in zip(top_results[0], top_results[1]):
+             if score.item() >= 0.87:
+                 sanitised_questions_array.remove(raw_top_asked_question)
+
+     return sanitised_questions_array
+
+
+ def final_phase_filtering(raw_first_phase_filtered_questions, limit=20):
+     raw_first_phase_filtered_questions = raw_first_phase_filtered_questions[:limit]
+     for raw_first_phase_filtered_question in raw_first_phase_filtered_questions:
+         for raw_first_phase_filtered_question_inner in raw_first_phase_filtered_questions:
+             emb1 = embedder.encode(raw_first_phase_filtered_question[0])
+             emb2 = embedder.encode(raw_first_phase_filtered_question_inner[0])
+
+             cos_sim = util.cos_sim(emb1, emb2)
+
+             if 0.85 <= cos_sim.item() < 1.0000001192092896:
+                 raw_first_phase_filtered_questions.remove(raw_first_phase_filtered_question_inner)
+
+     return raw_first_phase_filtered_questions
+
+
+ def return_top_question(limit=5):
+     questions = get_question()
+     count_top_questions_ = count_top_questions(questions)
+     remove_redundancy_ = remove_redundancy(count_top_questions_)
+     remove_greetings_ = remove_greetings(remove_redundancy_)
+     final_phase_filtering_ = final_phase_filtering(remove_greetings_)[:limit]
+
+     message = 'Top questions asked on ask twimbit or on the platform by users:'
+     for key, final_phase_filtering__ in enumerate(final_phase_filtering_):
+         message = message + '\n {}: '.format(key + 1) + final_phase_filtering__[0]
+
+     return message
+
+ # print(return_top_question())
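
Note: a small self-contained sketch of the cosine-similarity counting idea used in count_top_questions above, run on made-up toy questions (the model name matches the module; everything else is illustrative):

# Toy illustration of counting near-duplicate questions with sentence embeddings.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')
questions = ["What is fintech?", "what is fintech", "How do neobanks make money?"]

emb = model.encode(questions, convert_to_tensor=True)
scores = util.cos_sim(emb, emb)  # pairwise cosine similarities

for i, q in enumerate(questions):
    # count other questions whose similarity to this one is at least 0.8
    near_duplicates = sum(1 for j in range(len(questions)) if j != i and scores[i][j].item() >= 0.8)
    print(q, near_duplicates)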