leo-pasi committed on
Commit
45e69ef
·
1 Parent(s): 5122ccf

first commit to deploy

Browse files
.flake8 ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ [flake8]
2
+ ignore = E203, W503
3
+ # matches black's default
4
+ max-line-length = 88
5
+ docstring-convention = numpy
6
+ per-file-ignores =
7
+ __init__.py:F401
8
+ exclude = .git, __pycache__, .venv
.gitignore ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # poetry
2
+ # It is generally recommended to include poetry.lock in version control.
3
+ # This is especially recommended for binary packages to ensure reproducibility,
4
+ # and is more commonly ignored for libraries.
5
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
6
+ #poetry.lock
7
+
8
+ # Environments
9
+ .env
10
+ .venv
11
+
12
+ __pycache__/
13
+
14
+ *.sqlite
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ default_language_version:
2
+ python: python3.10
3
+
4
+ repos:
5
+ - repo: https://github.com/pre-commit/pre-commit-hooks
6
+ rev: v5.0.0
7
+ hooks:
8
+ - id: check-json
9
+ description: This hook checks json files for parseable syntax.
10
+ - id: check-yaml
11
+ description: This hook checks yaml files for parseable syntax.
12
+ - id: check-toml
13
+ description: This hook checks toml files for parseable syntax.
14
+ - id: check-ast
15
+ description: Simply check whether files parse as valid python.
16
+ - id: check-merge-conflict
17
+ description: Check for files that contain merge conflict strings.
18
+ - id: debug-statements
19
+ description: Check for debugger imports and py37+ `breakpoint()` calls in python source.
20
+ - id: end-of-file-fixer
21
+ description: Ensures that a file is either empty, or ends with one newline.
22
+ - id: name-tests-test
23
+ description: This verifies that test files are named correctly - test* format (prefix and not suffix)
24
+ args: ["--pytest-test-first"]
25
+ - id: trailing-whitespace
26
+ args: ["--markdown-linebreak-ext=md"]
27
+ description: This hook trims trailing whitespace.
28
+
29
+ - repo: https://github.com/asottile/pyupgrade
30
+ rev: v3.19.1
31
+ hooks:
32
+ - id: pyupgrade
33
+ args: [--py310-plus]
34
+
35
+ # Pre-commit is installed by poetry alongside black, isort, flake8
36
+ - repo: local
37
+ hooks:
38
+ - id: black
39
+ name: black
40
+ entry: poetry run black --config pyproject.toml
41
+ language: system
42
+ require_serial: true
43
+ types: [python]
44
+ - id: isort
45
+ name: isort
46
+ entry: poetry run isort --settings-path pyproject.toml
47
+ language: system
48
+ types: [python]
49
+ - id: flake8
50
+ name: flake8
51
+ entry: poetry run flake8 --config .flake8
52
+ language: system
53
+ types: [python]
README.md CHANGED
@@ -10,4 +10,4 @@ pinned: false
10
  short_description: RAG chatbot trained on my master thesis.
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
10
  short_description: RAG chatbot trained on my master thesis.
11
  ---
12
 
13
+ This is a private demo using my own OpenAI API key. Please use responsibly.
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from mythesis_chatbot.rag_setup import (
4
+ SupportedRags,
5
+ automerging_retrieval_setup,
6
+ basic_rag_setup,
7
+ sentence_window_retrieval_setup,
8
+ )
9
+
10
# Source document and the directory handed to every setup helper as ``save_dir``
# (presumably where the helpers persist/load their indices -- confirm in
# mythesis_chatbot.rag_setup).
input_file = "./data/Master_Thesis.pdf"
save_dir = "./data/indices/"

# Settings common to all three engines, defined once so that changing a model or
# a retrieval parameter cannot leave the engines configured inconsistently.
_shared_engine_kwargs = {
    "input_file": input_file,
    "save_dir": save_dir,
    "llm_openai_model": "gpt-4o-mini",
    "embed_model": "BAAI/bge-small-en-v1.5",
    "similarity_top_k": 6,
    "rerank_model": "cross-encoder/ms-marco-MiniLM-L-2-v2",
    "rerank_top_n": 2,
}

# The engines are built once at import time and reused for every request.
automerging_engine = automerging_retrieval_setup(
    chunk_sizes=[2048, 512, 128],
    **_shared_engine_kwargs,
)

sentence_window_engine = sentence_window_retrieval_setup(
    sentence_window_size=3,
    **_shared_engine_kwargs,
)

basic_engine = basic_rag_setup(**_shared_engine_kwargs)
44
+
45
+
46
def chat_bot(query: str, rag_mode: SupportedRags) -> str:
    """Answer ``query`` with the query engine selected by ``rag_mode``.

    Parameters
    ----------
    query : str
        Question forwarded to the selected engine's ``query`` method.
    rag_mode : SupportedRags
        One of ``"basic"``, ``"auto-merging retrieval"`` or
        ``"sentence window retrieval"``.

    Returns
    -------
    str
        The ``response`` attribute of the engine's query result.

    Raises
    ------
    ValueError
        If ``rag_mode`` is not one of the supported modes.
    """
    if rag_mode == "basic":
        engine = basic_engine
    elif rag_mode == "auto-merging retrieval":
        engine = automerging_engine
    elif rag_mode == "sentence window retrieval":
        engine = sentence_window_engine
    else:
        # Fail loudly instead of implicitly returning None, which would
        # contradict the declared ``str`` return type.
        raise ValueError(f"Unsupported RAG mode: {rag_mode!r}")
    return engine.query(query).response
53
+
54
+
55
# Hint text for the question box. Adjacent string literals are concatenated with
# no separator, so the first literal must end with an explicit space.
default_message = (
    "Ask about a topic that is discussed in my master thesis. "
    "E.g., what is epistemic uncertainty?"
)
59
+
60
# Input components, listed in the same order as chat_bot's parameters
# (question first, then RAG mode).
question_box = gr.Textbox(placeholder=default_message)
rag_mode_dropdown = gr.Dropdown(
    label="RAG mode",
    value="basic",
    choices=["basic", "sentence window retrieval", "auto-merging retrieval"],
)

gradio_app = gr.Interface(
    fn=chat_bot,
    inputs=[question_box, rag_mode_dropdown],
    outputs=["text"],
)

if __name__ == "__main__":
    # Launch the UI only when this module is executed as a script.
    gradio_app.launch()
data/indices/auto_merging/d5c92a9b2f/default__vector_store.json ADDED
The diff for this file is too large to render. See raw diff
 
data/indices/auto_merging/d5c92a9b2f/docstore.json ADDED
The diff for this file is too large to render. See raw diff
 
data/indices/auto_merging/d5c92a9b2f/graph_store.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"graph_dict": {}}
data/indices/auto_merging/d5c92a9b2f/image__vector_store.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"embedding_dict": {}, "text_id_to_ref_doc_id": {}, "metadata_dict": {}}
data/indices/auto_merging/d5c92a9b2f/index_store.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"index_store/data": {"01d4fc3b-cde7-4254-a776-0340613d6542": {"__type__": "vector_store", "__data__": "{\"index_id\": \"01d4fc3b-cde7-4254-a776-0340613d6542\", \"summary\": null, \"nodes_dict\": {\"127eed05-f4c0-492c-aebb-fe8ea02ca1e5\": \"127eed05-f4c0-492c-aebb-fe8ea02ca1e5\", \"2935ee7a-e93e-49cf-95c5-78bc8dff3e9a\": \"2935ee7a-e93e-49cf-95c5-78bc8dff3e9a\", \"4a4dbbb4-76fe-4fdd-a535-6b639d06fed1\": \"4a4dbbb4-76fe-4fdd-a535-6b639d06fed1\", \"4149f79f-033e-4947-89f4-d0f132e26664\": \"4149f79f-033e-4947-89f4-d0f132e26664\", \"70d2a7d7-8312-47b7-8e75-cdac5ba2e41c\": \"70d2a7d7-8312-47b7-8e75-cdac5ba2e41c\", \"6ba0ab8c-7d35-464d-b1a7-3e51a3305397\": \"6ba0ab8c-7d35-464d-b1a7-3e51a3305397\", \"f48df62c-d67c-4992-a458-b8b82f24a21e\": \"f48df62c-d67c-4992-a458-b8b82f24a21e\", \"d9b7d187-5420-4a21-ba94-b677e41b31c5\": \"d9b7d187-5420-4a21-ba94-b677e41b31c5\", \"d18aaf46-78ba-4ac4-b2d8-04877df01f04\": \"d18aaf46-78ba-4ac4-b2d8-04877df01f04\", \"823db52f-3c7c-49f2-b06d-6c592db7ef42\": \"823db52f-3c7c-49f2-b06d-6c592db7ef42\", \"774f5af1-b593-47eb-9d6a-354c538e08f3\": \"774f5af1-b593-47eb-9d6a-354c538e08f3\", \"b0a8bf13-67ba-44ed-b2f8-ba822e7faadb\": \"b0a8bf13-67ba-44ed-b2f8-ba822e7faadb\", \"e2dfa4a4-b378-4b62-84f7-bef1ec7ceb0f\": \"e2dfa4a4-b378-4b62-84f7-bef1ec7ceb0f\", \"63f8f248-245f-4955-bedd-a2866caaed7a\": \"63f8f248-245f-4955-bedd-a2866caaed7a\", \"fe153069-6018-4026-9d8a-b4685d64ed38\": \"fe153069-6018-4026-9d8a-b4685d64ed38\", \"9c5c5819-4547-48ff-8f13-f22d11438a3a\": \"9c5c5819-4547-48ff-8f13-f22d11438a3a\", \"c6fb2fef-6474-47b8-9092-b026ab781d2b\": \"c6fb2fef-6474-47b8-9092-b026ab781d2b\", \"ecf9695e-b641-4b46-a467-0bd43708cbe0\": \"ecf9695e-b641-4b46-a467-0bd43708cbe0\", \"d724a087-a3d0-4009-bd38-b806365de8c7\": \"d724a087-a3d0-4009-bd38-b806365de8c7\", \"63e59f20-3737-45d8-9a92-d32c74bb7166\": \"63e59f20-3737-45d8-9a92-d32c74bb7166\", \"066b93a5-3342-4cc4-8571-98cadb2a3164\": \"066b93a5-3342-4cc4-8571-98cadb2a3164\", 
\"58ad19b7-324c-4be6-a9b8-3fb64690c7bb\": \"58ad19b7-324c-4be6-a9b8-3fb64690c7bb\", \"9922a1b5-0c0b-4c9f-bfe5-b353886f674a\": \"9922a1b5-0c0b-4c9f-bfe5-b353886f674a\", \"1a2a3619-bfd3-4b5d-8688-08beca93b771\": \"1a2a3619-bfd3-4b5d-8688-08beca93b771\", \"fd961f40-ff40-4c4d-9fc2-ae381ad4b060\": \"fd961f40-ff40-4c4d-9fc2-ae381ad4b060\", \"e99348f4-24f8-4709-b5b5-311c923a356a\": \"e99348f4-24f8-4709-b5b5-311c923a356a\", \"fa140f79-d1f4-444a-b07c-c3c2b00bcb6a\": \"fa140f79-d1f4-444a-b07c-c3c2b00bcb6a\", \"9071c131-fcde-4881-9954-dfdd8b1909a3\": \"9071c131-fcde-4881-9954-dfdd8b1909a3\", \"4dec264a-3556-4369-a51f-fa460f2e3e67\": \"4dec264a-3556-4369-a51f-fa460f2e3e67\", \"88b2e2ea-51e1-407b-a3db-6120ddb7a953\": \"88b2e2ea-51e1-407b-a3db-6120ddb7a953\", \"612e4522-6b23-4b77-a1a6-cc15d93cbfad\": \"612e4522-6b23-4b77-a1a6-cc15d93cbfad\", \"b1a24e8a-e7a6-490c-92fb-bf690a24a9cc\": \"b1a24e8a-e7a6-490c-92fb-bf690a24a9cc\", \"b4e16def-e018-4af7-9216-35a9f688b90e\": \"b4e16def-e018-4af7-9216-35a9f688b90e\", \"ba7f5982-2eee-425b-813b-c17de4d0bf65\": \"ba7f5982-2eee-425b-813b-c17de4d0bf65\", \"8159166f-ff8a-4af8-9c75-6af7349664af\": \"8159166f-ff8a-4af8-9c75-6af7349664af\", \"507c5eea-78f1-422c-a6d3-a187e4291c1f\": \"507c5eea-78f1-422c-a6d3-a187e4291c1f\", \"f78e2e16-decf-4534-bbd2-c0aab0f37021\": \"f78e2e16-decf-4534-bbd2-c0aab0f37021\", \"9fdd0b73-5596-42cb-9361-ac375d6f0415\": \"9fdd0b73-5596-42cb-9361-ac375d6f0415\", \"b62c885a-a745-427f-b3a1-5fb1988990bc\": \"b62c885a-a745-427f-b3a1-5fb1988990bc\", \"c313afad-778d-4960-b5ae-858e82effd8d\": \"c313afad-778d-4960-b5ae-858e82effd8d\", \"a6693f1e-bd6a-46b9-adc4-056ea375dab0\": \"a6693f1e-bd6a-46b9-adc4-056ea375dab0\", \"6b4320f6-7f58-47bd-82b4-42567bcb1bc8\": \"6b4320f6-7f58-47bd-82b4-42567bcb1bc8\", \"0a2e62df-7f45-47ee-9a98-86d449d32648\": \"0a2e62df-7f45-47ee-9a98-86d449d32648\", \"676f69bd-4650-4c90-a9fd-253354c2fd06\": \"676f69bd-4650-4c90-a9fd-253354c2fd06\", \"ea0f952a-51a3-4033-8131-4a6f189c4cba\": 
\"ea0f952a-51a3-4033-8131-4a6f189c4cba\", \"c9728fd6-54ea-4466-b7e4-71bf94f2db36\": \"c9728fd6-54ea-4466-b7e4-71bf94f2db36\", \"33a22272-74c1-4053-899b-f52ee8721489\": \"33a22272-74c1-4053-899b-f52ee8721489\", \"4e54f3b3-3cec-4266-9bfd-b7ba95ce2939\": \"4e54f3b3-3cec-4266-9bfd-b7ba95ce2939\", \"b11f795c-9011-4923-9346-e9949df69467\": \"b11f795c-9011-4923-9346-e9949df69467\", \"55eec99b-78ea-4843-af4a-46d61411510b\": \"55eec99b-78ea-4843-af4a-46d61411510b\", \"1b591c75-28e9-440c-8e65-5eaf1b979fc5\": \"1b591c75-28e9-440c-8e65-5eaf1b979fc5\", \"fd91c70a-adcb-4a30-9a73-73a17707624d\": \"fd91c70a-adcb-4a30-9a73-73a17707624d\", \"cc12d5e5-c058-4c6e-97f4-232c2f91af7a\": \"cc12d5e5-c058-4c6e-97f4-232c2f91af7a\", \"5713b650-3c69-4369-b4a1-25e79a3f4b4b\": \"5713b650-3c69-4369-b4a1-25e79a3f4b4b\", \"6216ecc3-4580-4861-905a-51f0c233a993\": \"6216ecc3-4580-4861-905a-51f0c233a993\", \"202ed815-c9f3-4180-976f-2b3d0fc99b3a\": \"202ed815-c9f3-4180-976f-2b3d0fc99b3a\", \"fc45659e-9e72-42e5-b1f5-e964f6474119\": \"fc45659e-9e72-42e5-b1f5-e964f6474119\", \"57d310ce-6e84-4805-b3ee-4137f01377e4\": \"57d310ce-6e84-4805-b3ee-4137f01377e4\", \"da804d94-7c21-4a4b-91f5-00ba625e411f\": \"da804d94-7c21-4a4b-91f5-00ba625e411f\", \"ae3e487e-3473-4c6c-a5bc-d6cec3cb8565\": \"ae3e487e-3473-4c6c-a5bc-d6cec3cb8565\", \"086f66d3-b416-41bd-af49-179420725658\": \"086f66d3-b416-41bd-af49-179420725658\", \"45da310f-48f1-4072-930a-ed4abccd4067\": \"45da310f-48f1-4072-930a-ed4abccd4067\", \"ebae6ed2-50e2-4e10-8c43-8d892eece566\": \"ebae6ed2-50e2-4e10-8c43-8d892eece566\", \"23c1c50a-828e-403c-ae44-4fcba39d6197\": \"23c1c50a-828e-403c-ae44-4fcba39d6197\", \"8d13191d-7fa0-401f-909f-e761626681de\": \"8d13191d-7fa0-401f-909f-e761626681de\", \"5f40ffb2-adf0-45f0-8837-0edd1db895a5\": \"5f40ffb2-adf0-45f0-8837-0edd1db895a5\", \"1d25b9e7-defd-4782-b095-1562832e7d21\": \"1d25b9e7-defd-4782-b095-1562832e7d21\", \"9013bbb3-cb21-4429-99cc-e8c7426f10fb\": \"9013bbb3-cb21-4429-99cc-e8c7426f10fb\", 
\"978359a5-1e65-4640-8a9f-343a2a59698d\": \"978359a5-1e65-4640-8a9f-343a2a59698d\", \"30c71308-f900-4bae-8ec6-d86d4dd13419\": \"30c71308-f900-4bae-8ec6-d86d4dd13419\", \"9819cc86-9766-441a-aa8c-40e186e24fea\": \"9819cc86-9766-441a-aa8c-40e186e24fea\", \"5c6de902-b23a-4c7e-855f-cd5eb652add2\": \"5c6de902-b23a-4c7e-855f-cd5eb652add2\", \"bb921f3b-6a8e-48ca-b4b5-e599a6d6c406\": \"bb921f3b-6a8e-48ca-b4b5-e599a6d6c406\", \"d4c91e89-461c-4294-ace0-23d5ce6747c0\": \"d4c91e89-461c-4294-ace0-23d5ce6747c0\", \"8f9fd36f-b761-4f40-a66c-6c4f75d4207b\": \"8f9fd36f-b761-4f40-a66c-6c4f75d4207b\", \"15c3b984-ff09-4f68-9d55-fe705c4f4ae4\": \"15c3b984-ff09-4f68-9d55-fe705c4f4ae4\", \"0526c5ab-8527-42b1-b1d8-1cfe710874f4\": \"0526c5ab-8527-42b1-b1d8-1cfe710874f4\", \"d4aa2f5a-33a8-453a-a6ad-c4bacf576fab\": \"d4aa2f5a-33a8-453a-a6ad-c4bacf576fab\", \"0ccd7498-9105-4204-bafd-6edc9cf75849\": \"0ccd7498-9105-4204-bafd-6edc9cf75849\", \"bd6b344e-d289-4006-a983-d9ffff88650f\": \"bd6b344e-d289-4006-a983-d9ffff88650f\", \"1fb579b0-32b1-4482-93af-f8335ef3d5e9\": \"1fb579b0-32b1-4482-93af-f8335ef3d5e9\", \"ab58e4bc-757c-4b83-8279-13bf36e223f4\": \"ab58e4bc-757c-4b83-8279-13bf36e223f4\", \"d6db2978-cf10-4fd6-949e-fc950bbf2598\": \"d6db2978-cf10-4fd6-949e-fc950bbf2598\", \"c2191d8c-3f02-4542-a078-164beec931a8\": \"c2191d8c-3f02-4542-a078-164beec931a8\", \"ca4bf718-63a5-4fb5-b024-7c448c880f95\": \"ca4bf718-63a5-4fb5-b024-7c448c880f95\", \"faf19732-15b6-4344-8678-a4b2c6a85dd1\": \"faf19732-15b6-4344-8678-a4b2c6a85dd1\", \"dddf8bf2-5179-4791-a310-7710be80004b\": \"dddf8bf2-5179-4791-a310-7710be80004b\", \"113bc7b3-5f99-45ca-b093-0b867520f051\": \"113bc7b3-5f99-45ca-b093-0b867520f051\", \"c6d6677b-f019-4c31-a8d0-503541971456\": \"c6d6677b-f019-4c31-a8d0-503541971456\", \"6b031f5f-0e40-4434-8ce5-d1e0ba08f7a0\": \"6b031f5f-0e40-4434-8ce5-d1e0ba08f7a0\", \"66cd63ad-1f7e-415f-99fb-100448af6aa2\": \"66cd63ad-1f7e-415f-99fb-100448af6aa2\", \"78390fc6-05b5-471f-9ed8-a2f24c6d53a8\": 
\"78390fc6-05b5-471f-9ed8-a2f24c6d53a8\", \"10d7b5e0-3e25-416f-bc00-2a01933c6b9b\": \"10d7b5e0-3e25-416f-bc00-2a01933c6b9b\", \"521b4f69-fc53-4ee1-aa3b-9a64d4689aab\": \"521b4f69-fc53-4ee1-aa3b-9a64d4689aab\", \"a63ebc03-cc47-4b22-bcbe-b26c328d8ad4\": \"a63ebc03-cc47-4b22-bcbe-b26c328d8ad4\", \"05cd4e21-ea82-46c9-9edb-0ce5101680e8\": \"05cd4e21-ea82-46c9-9edb-0ce5101680e8\", \"9ab01bf2-af0f-4b17-9232-46eca13fd2cc\": \"9ab01bf2-af0f-4b17-9232-46eca13fd2cc\", \"b8885c9f-be74-4eac-b33e-6388fca0496f\": \"b8885c9f-be74-4eac-b33e-6388fca0496f\", \"f6111520-529e-4fde-8438-3bdb22626f29\": \"f6111520-529e-4fde-8438-3bdb22626f29\", \"305361e0-447c-4b9a-bd75-cb60ce8e508a\": \"305361e0-447c-4b9a-bd75-cb60ce8e508a\", \"6d55016e-f6c2-4b28-9228-eae4e5982a3d\": \"6d55016e-f6c2-4b28-9228-eae4e5982a3d\", \"da53c1af-ff4e-4e59-a3b4-8ba2e6b62195\": \"da53c1af-ff4e-4e59-a3b4-8ba2e6b62195\", \"7cc0cbb7-3405-4440-bd3a-fe94b1d814f1\": \"7cc0cbb7-3405-4440-bd3a-fe94b1d814f1\", \"aa03814b-1151-47bf-b2c3-d5f728e0ed36\": \"aa03814b-1151-47bf-b2c3-d5f728e0ed36\", \"8fe63cef-848b-4b24-a35e-d09771946411\": \"8fe63cef-848b-4b24-a35e-d09771946411\", \"cfc29151-e033-4efa-8330-099a55348c4e\": \"cfc29151-e033-4efa-8330-099a55348c4e\", \"967d28de-3d2f-4905-8de3-660bf3740fca\": \"967d28de-3d2f-4905-8de3-660bf3740fca\", \"49bb4996-036c-48d4-9e30-5295bc7ac314\": \"49bb4996-036c-48d4-9e30-5295bc7ac314\", \"8fe6d34d-9c21-4656-86d3-732a702b4ec2\": \"8fe6d34d-9c21-4656-86d3-732a702b4ec2\", \"e09fef37-7df1-48a1-bfae-7d3cf85d77e9\": \"e09fef37-7df1-48a1-bfae-7d3cf85d77e9\", \"1efa753a-6d54-4d40-8bdf-0f52e6fea8b5\": \"1efa753a-6d54-4d40-8bdf-0f52e6fea8b5\", \"9d9f1478-1daf-40ec-8cd4-87f4fa12250a\": \"9d9f1478-1daf-40ec-8cd4-87f4fa12250a\", \"d77ea73b-146d-4d3a-b6b5-3f7d5bdca293\": \"d77ea73b-146d-4d3a-b6b5-3f7d5bdca293\", \"dfb85eda-0acd-4182-a136-c8692ca0a786\": \"dfb85eda-0acd-4182-a136-c8692ca0a786\", \"afd5e88c-7489-4f00-a9ad-44ecd97dbfb2\": \"afd5e88c-7489-4f00-a9ad-44ecd97dbfb2\", 
\"cb588028-5fd3-4f58-8384-ff9ebde0ae11\": \"cb588028-5fd3-4f58-8384-ff9ebde0ae11\", \"195ea30a-2725-4638-8edb-425d24fc565d\": \"195ea30a-2725-4638-8edb-425d24fc565d\", \"63e42cac-6151-43e7-85db-f41c374bb94b\": \"63e42cac-6151-43e7-85db-f41c374bb94b\", \"aac01576-0a03-407b-91cd-e222aecfd721\": \"aac01576-0a03-407b-91cd-e222aecfd721\", \"ab56bea2-c39a-444a-9434-dff308adbbea\": \"ab56bea2-c39a-444a-9434-dff308adbbea\", \"637776b7-4a00-4514-ae55-e2dfe5c640f6\": \"637776b7-4a00-4514-ae55-e2dfe5c640f6\", \"bbdf5750-fabd-4e11-b784-19a0b62b520b\": \"bbdf5750-fabd-4e11-b784-19a0b62b520b\", \"b10f58ab-250a-4f49-8497-16be51d652ee\": \"b10f58ab-250a-4f49-8497-16be51d652ee\", \"a0e4e6e7-b68e-4e73-b5e0-8d5d38b1bf70\": \"a0e4e6e7-b68e-4e73-b5e0-8d5d38b1bf70\", \"50d36569-0c7c-4cae-a1c0-6bb1c8990fc0\": \"50d36569-0c7c-4cae-a1c0-6bb1c8990fc0\", \"d8b051cc-f398-40c7-9326-433d79f1051d\": \"d8b051cc-f398-40c7-9326-433d79f1051d\", \"6859747b-0a8a-48c4-980f-6600405219ab\": \"6859747b-0a8a-48c4-980f-6600405219ab\", \"b682c9ff-86e2-42e3-a4e9-c188dd7cfb8f\": \"b682c9ff-86e2-42e3-a4e9-c188dd7cfb8f\", \"c4c3dd73-196b-4290-82dd-4046b9bb0ad8\": \"c4c3dd73-196b-4290-82dd-4046b9bb0ad8\", \"80817e53-55d3-43d3-83d1-a719a414926a\": \"80817e53-55d3-43d3-83d1-a719a414926a\", \"8e6e18c6-4421-4f3d-9290-ba27dfcaf95d\": \"8e6e18c6-4421-4f3d-9290-ba27dfcaf95d\", \"d71c30f3-4e73-4117-8927-b336191f9d1c\": \"d71c30f3-4e73-4117-8927-b336191f9d1c\", \"694102af-3409-4464-bb8d-19b1f381f253\": \"694102af-3409-4464-bb8d-19b1f381f253\", \"7a70e0d5-0a6d-4ae6-a4c2-e0290bc97d7d\": \"7a70e0d5-0a6d-4ae6-a4c2-e0290bc97d7d\", \"7a93bcb4-c2e9-4717-9c8b-2f99c1b5c97f\": \"7a93bcb4-c2e9-4717-9c8b-2f99c1b5c97f\", \"0faa8361-dc17-4fba-a3c1-eae0d76cc290\": \"0faa8361-dc17-4fba-a3c1-eae0d76cc290\", \"97b47867-e66b-4e98-bc15-fbde3135ba50\": \"97b47867-e66b-4e98-bc15-fbde3135ba50\", \"df6864be-1e32-4467-aeb8-81f7f38176e1\": \"df6864be-1e32-4467-aeb8-81f7f38176e1\", \"332fe951-d55a-44b7-b5a2-bfbeb1c57151\": 
\"332fe951-d55a-44b7-b5a2-bfbeb1c57151\", \"c0212a98-a9a5-4e81-abc4-7bcef93be942\": \"c0212a98-a9a5-4e81-abc4-7bcef93be942\", \"b42544bb-3912-482e-8047-ad5386607134\": \"b42544bb-3912-482e-8047-ad5386607134\", \"6feacb6d-c009-40f0-9392-dc2d6bd0cb7b\": \"6feacb6d-c009-40f0-9392-dc2d6bd0cb7b\", \"981cfbe8-bed1-4d15-97e9-dddfc96c1123\": \"981cfbe8-bed1-4d15-97e9-dddfc96c1123\", \"51b4e8a9-f9dd-4153-9ca8-36e2bb679353\": \"51b4e8a9-f9dd-4153-9ca8-36e2bb679353\", \"5b10e014-d531-451f-8425-864e25fa5558\": \"5b10e014-d531-451f-8425-864e25fa5558\", \"e44bddbb-d644-4ac6-8a1c-726ebf9627f4\": \"e44bddbb-d644-4ac6-8a1c-726ebf9627f4\", \"03f8aa5e-45cf-4f50-bd7e-d4f06ebefc23\": \"03f8aa5e-45cf-4f50-bd7e-d4f06ebefc23\", \"b7723603-864a-4164-9084-336d25f2890d\": \"b7723603-864a-4164-9084-336d25f2890d\", \"b2342ae1-ec3e-4d96-aaa4-1dfc15e8279b\": \"b2342ae1-ec3e-4d96-aaa4-1dfc15e8279b\", \"1c7a3a4e-db10-480e-9851-4b91640abc16\": \"1c7a3a4e-db10-480e-9851-4b91640abc16\", \"ef773976-8799-4129-979e-4d74984bd69d\": \"ef773976-8799-4129-979e-4d74984bd69d\", \"f85638bc-678e-4101-b240-962c6c1c8137\": \"f85638bc-678e-4101-b240-962c6c1c8137\", \"aba94add-6b16-4f14-9b0a-e5ec1ea041a0\": \"aba94add-6b16-4f14-9b0a-e5ec1ea041a0\", \"07e6b100-c219-485e-b418-d1d938ca0102\": \"07e6b100-c219-485e-b418-d1d938ca0102\", \"fced2909-7e86-41c3-924f-d06b644b76a7\": \"fced2909-7e86-41c3-924f-d06b644b76a7\", \"284789e6-68c0-4bf2-982c-288eaa266705\": \"284789e6-68c0-4bf2-982c-288eaa266705\", \"64b45aed-c073-4f5a-b33f-939104b0f21f\": \"64b45aed-c073-4f5a-b33f-939104b0f21f\", \"244967f0-6448-4a4f-9e2b-f96e26f682dd\": \"244967f0-6448-4a4f-9e2b-f96e26f682dd\", \"62e784ee-a4cf-474d-aabf-0044130aa013\": \"62e784ee-a4cf-474d-aabf-0044130aa013\", \"3f460913-76d7-44b2-820e-d4b1160f4b20\": \"3f460913-76d7-44b2-820e-d4b1160f4b20\", \"cb16543a-5ec6-4947-bb55-b9784e6e7c7c\": \"cb16543a-5ec6-4947-bb55-b9784e6e7c7c\", \"5235a666-1e07-4abc-8742-8e1bcf0e790d\": \"5235a666-1e07-4abc-8742-8e1bcf0e790d\", 
\"496a456f-2ec3-48cf-940f-602619bd2c6c\": \"496a456f-2ec3-48cf-940f-602619bd2c6c\", \"cb50208d-24e7-4954-972f-900604a9ab61\": \"cb50208d-24e7-4954-972f-900604a9ab61\", \"aaec41e4-2cd2-44d8-8c78-5ca1268a0780\": \"aaec41e4-2cd2-44d8-8c78-5ca1268a0780\", \"bae30186-8436-4dd3-ab19-b6c86c3d67c1\": \"bae30186-8436-4dd3-ab19-b6c86c3d67c1\", \"1fc0e546-5e5b-43fd-8efe-1aa0db4fad69\": \"1fc0e546-5e5b-43fd-8efe-1aa0db4fad69\"}, \"doc_id_dict\": {}, \"embeddings_dict\": {}}"}}}
data/indices/auto_merging/d5c92a9b2f/meta.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "doc_source": "Master_Thesis.pdf",
3
+ "embed_model": "BAAI/bge-small-en-v1.5",
4
+ "chunk_sizes": [
5
+ 2048,
6
+ 512,
7
+ 128
8
+ ]
9
+ }
data/indices/basic/default__vector_store.json ADDED
The diff for this file is too large to render. See raw diff
 
data/indices/basic/docstore.json ADDED
The diff for this file is too large to render. See raw diff
 
data/indices/basic/graph_store.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"graph_dict": {}}
data/indices/basic/image__vector_store.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"embedding_dict": {}, "text_id_to_ref_doc_id": {}, "metadata_dict": {}}
data/indices/basic/index_store.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"index_store/data": {"0765479e-c27c-4c88-b41f-dc98ed86f3a5": {"__type__": "vector_store", "__data__": "{\"index_id\": \"0765479e-c27c-4c88-b41f-dc98ed86f3a5\", \"summary\": null, \"nodes_dict\": {\"a31b64b9-d4c7-4ec6-8ca0-10d085d48205\": \"a31b64b9-d4c7-4ec6-8ca0-10d085d48205\", \"b17212a2-233f-492c-b53b-6a03c84d9f4f\": \"b17212a2-233f-492c-b53b-6a03c84d9f4f\", \"21f04a43-0875-4b86-94e4-93b11843969f\": \"21f04a43-0875-4b86-94e4-93b11843969f\", \"35cb1ff3-4a4f-4616-a7c1-af26f0b9d1e9\": \"35cb1ff3-4a4f-4616-a7c1-af26f0b9d1e9\", \"db73e86c-4c24-4992-b2c4-96c3cb23e27e\": \"db73e86c-4c24-4992-b2c4-96c3cb23e27e\", \"66023780-9b49-46a9-9204-8a4786f537e9\": \"66023780-9b49-46a9-9204-8a4786f537e9\", \"3ed814ba-a0a4-4328-802d-91780bc5964e\": \"3ed814ba-a0a4-4328-802d-91780bc5964e\", \"2ce891bb-b95c-457c-859e-7a8980eb98c2\": \"2ce891bb-b95c-457c-859e-7a8980eb98c2\", \"04a0e9e2-4399-4f8e-9bb7-4a92ca7f3dfa\": \"04a0e9e2-4399-4f8e-9bb7-4a92ca7f3dfa\", \"39a9562c-9040-4276-b6a3-0686a44bcf50\": \"39a9562c-9040-4276-b6a3-0686a44bcf50\", \"fe41da69-d1ee-4ce6-8888-34e9c9a759b5\": \"fe41da69-d1ee-4ce6-8888-34e9c9a759b5\", \"5e20f7b6-ac06-4a68-913b-419a2b585f5d\": \"5e20f7b6-ac06-4a68-913b-419a2b585f5d\", \"c0f8c088-1ff8-4839-a325-2522f014510a\": \"c0f8c088-1ff8-4839-a325-2522f014510a\", \"e29d9346-c846-4203-8c35-b8bb1e8fa481\": \"e29d9346-c846-4203-8c35-b8bb1e8fa481\", \"004767a1-85b2-466e-b117-b264a4667205\": \"004767a1-85b2-466e-b117-b264a4667205\", \"bd3c9a8c-f95b-478e-a4a2-01ed7f774d64\": \"bd3c9a8c-f95b-478e-a4a2-01ed7f774d64\", \"5084cc68-c3d7-4136-83c1-b8c48955719a\": \"5084cc68-c3d7-4136-83c1-b8c48955719a\", \"a1381f9d-d625-43ab-8869-28f2ec055ddd\": \"a1381f9d-d625-43ab-8869-28f2ec055ddd\", \"7e4d7d91-e79e-49e5-8c71-8a48fc38cb46\": \"7e4d7d91-e79e-49e5-8c71-8a48fc38cb46\", \"379dd8d5-bf98-4615-93a9-cba410df45ec\": \"379dd8d5-bf98-4615-93a9-cba410df45ec\"}, \"doc_id_dict\": {}, \"embeddings_dict\": {}}"}}}
data/indices/sentence_window/61a981e27b/default__vector_store.json ADDED
The diff for this file is too large to render. See raw diff
 
data/indices/sentence_window/61a981e27b/docstore.json ADDED
The diff for this file is too large to render. See raw diff
 
data/indices/sentence_window/61a981e27b/graph_store.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"graph_dict": {}}
data/indices/sentence_window/61a981e27b/image__vector_store.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"embedding_dict": {}, "text_id_to_ref_doc_id": {}, "metadata_dict": {}}
data/indices/sentence_window/61a981e27b/index_store.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"index_store/data": {"dcb26062-97ca-4980-98ba-3ac367ae9e38": {"__type__": "vector_store", "__data__": "{\"index_id\": \"dcb26062-97ca-4980-98ba-3ac367ae9e38\", \"summary\": null, \"nodes_dict\": {\"9ab5067f-1cba-4f14-b483-f13e4943eb6f\": \"9ab5067f-1cba-4f14-b483-f13e4943eb6f\", \"4b7406f1-4038-428d-a41b-ed15736a3ad2\": \"4b7406f1-4038-428d-a41b-ed15736a3ad2\", \"19f2c43a-c9ef-4afc-9607-38a5a898b98b\": \"19f2c43a-c9ef-4afc-9607-38a5a898b98b\", \"0fd054f0-bd92-4e92-8037-4fcc7393ef0c\": \"0fd054f0-bd92-4e92-8037-4fcc7393ef0c\", \"d898ce41-24a0-4315-9a8e-056fc948525e\": \"d898ce41-24a0-4315-9a8e-056fc948525e\", \"cd43f204-6cd3-414d-82b8-bcbde2cceace\": \"cd43f204-6cd3-414d-82b8-bcbde2cceace\", \"dcc52b5f-0e96-4574-a39a-ab928669c990\": \"dcc52b5f-0e96-4574-a39a-ab928669c990\", \"5df6f5f8-cc1d-408c-b9a1-85e01ec94403\": \"5df6f5f8-cc1d-408c-b9a1-85e01ec94403\", \"2521c213-0049-4cc6-875f-a868a9ca718e\": \"2521c213-0049-4cc6-875f-a868a9ca718e\", \"12ee176d-2954-4255-ab0a-65a6b867ea3a\": \"12ee176d-2954-4255-ab0a-65a6b867ea3a\", \"3e0d57bd-2b67-4083-b6b4-566dd1f91eab\": \"3e0d57bd-2b67-4083-b6b4-566dd1f91eab\", \"aa04a66d-6118-40d8-8290-3cf7d233ebad\": \"aa04a66d-6118-40d8-8290-3cf7d233ebad\", \"0a5e52be-600c-465e-aad7-10b04c9a7c8a\": \"0a5e52be-600c-465e-aad7-10b04c9a7c8a\", \"489b851b-8e96-4f53-a3bb-32fadb4ccbf0\": \"489b851b-8e96-4f53-a3bb-32fadb4ccbf0\", \"75c6b57b-009a-4b1d-9099-f188ecd51a0a\": \"75c6b57b-009a-4b1d-9099-f188ecd51a0a\", \"6ac79648-9763-4226-ad60-ac9f6ce20778\": \"6ac79648-9763-4226-ad60-ac9f6ce20778\", \"45842347-a5ac-43ac-b405-f77ace8a56a7\": \"45842347-a5ac-43ac-b405-f77ace8a56a7\", \"26cefadf-7040-43ea-9003-a1a4ad189372\": \"26cefadf-7040-43ea-9003-a1a4ad189372\", \"b6cbbc8a-2737-4272-a9d4-2f608931f1df\": \"b6cbbc8a-2737-4272-a9d4-2f608931f1df\", \"b5e2bdd2-6e23-4ccc-9bbd-4f08b831e046\": \"b5e2bdd2-6e23-4ccc-9bbd-4f08b831e046\"}, \"doc_id_dict\": {}, \"embeddings_dict\": {}}"}}}
data/indices/sentence_window/61a981e27b/meta.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "doc_source": "Master_Thesis.pdf",
3
+ "embed_model": "BAAI/bge-small-en-v1.5",
4
+ "sentence_window_size": 3
5
+ }
pyproject.toml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "mythesis-chatbot"
3
+ version = "0.1.0"
4
+ description = "RAG chatbot trained on my master thesis."
5
+ authors = [
6
+ {name = "Léonard Pasi",email = "leonardpasi@gmail.com"}
7
+ ]
8
+ readme = "README.md"
9
+ requires-python = "^3.10"
10
+ dependencies = [
11
+ "trulens (>=1.4.7,<2.0.0)",
12
+ "llama-index (>=0.12.28,<0.13.0)",
13
+ "pandas (>=2.2.3,<3.0.0)",
14
+ "llama-index-embeddings-huggingface (>=0.5.2,<0.6.0)",
15
+ "trulens-providers-openai (>=1.4.8,<2.0.0)",
16
+ "trulens-apps-llamaindex (>=1.4.8,<2.0.0)",
17
+ "gradio (==5.24.0)",
18
+ ]
19
+
20
+ [tool.poetry]
21
+ packages = [{include = "mythesis_chatbot", from = "src"}]
22
+
23
+ [tool.poetry.group.dev.dependencies]
24
+ pre-commit = "^4.2.0"
25
+ flake8 = "^7.2.0"
26
+ black = "^25.1.0"
27
+ isort = "^6.0.1"
28
+
29
+ [tool.black]
30
+ line-length = 88
31
+ exclude = '''
32
+ /(
33
+ \.git
34
+ | \.mypy_cache
35
+ | \.tox
36
+ | \.venv
37
+ | _build
38
+ | buck-out
39
+ | build
40
+ | dist
41
+ )/
42
+ '''
43
+
44
+ [tool.isort]
45
+ profile="black"
46
+ known_first_party=["mythesis_chatbot"]
47
+ known_third_party=["llamaindex", "trulens"]
48
+
49
+ [build-system]
50
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
51
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohappyeyeballs==2.6.1 ; python_version >= "3.10" and python_version < "4.0"
2
+ aiohttp==3.11.16 ; python_version >= "3.10" and python_version < "4.0"
3
+ aiosignal==1.3.2 ; python_version >= "3.10" and python_version < "4.0"
4
+ alembic==1.15.2 ; python_version >= "3.10" and python_version < "4.0"
5
+ altair==5.5.0 ; python_version >= "3.10" and python_version < "4.0"
6
+ annotated-types==0.7.0 ; python_version >= "3.10" and python_version < "4.0"
7
+ anyio==4.9.0 ; python_version >= "3.10" and python_version < "4.0"
8
+ appnope==0.1.4 ; python_version >= "3.10" and python_version < "4.0" and platform_system == "Darwin"
9
+ argon2-cffi-bindings==21.2.0 ; python_version >= "3.10" and python_version < "4.0"
10
+ argon2-cffi==23.1.0 ; python_version >= "3.10" and python_version < "4.0"
11
+ arrow==1.3.0 ; python_version >= "3.10" and python_version < "4.0"
12
+ asttokens==3.0.0 ; python_version >= "3.10" and python_version < "4.0"
13
+ async-lru==2.0.5 ; python_version >= "3.10" and python_version < "4.0"
14
+ async-timeout==4.0.3 ; python_version == "3.10"
15
+ attrs==25.3.0 ; python_version >= "3.10" and python_version < "4.0"
16
+ babel==2.17.0 ; python_version >= "3.10" and python_version < "4.0"
17
+ banks==2.1.1 ; python_version >= "3.10" and python_version < "4.0"
18
+ beautifulsoup4==4.13.3 ; python_version >= "3.10" and python_version < "4.0"
19
+ bleach==6.2.0 ; python_version >= "3.10" and python_version < "4.0"
20
+ blinker==1.9.0 ; python_version >= "3.10" and python_version < "4.0"
21
+ build==1.2.2.post1 ; python_version >= "3.10" and python_version < "4.0"
22
+ cachecontrol==0.14.2 ; python_version >= "3.10" and python_version < "4.0"
23
+ cachetools==5.5.2 ; python_version >= "3.10" and python_version < "4.0"
24
+ certifi==2025.1.31 ; python_version >= "3.10" and python_version < "4.0"
25
+ cffi==1.17.1 ; python_version >= "3.10" and python_version < "4.0"
26
+ charset-normalizer==3.4.1 ; python_version >= "3.10" and python_version < "4.0"
27
+ cleo==2.1.0 ; python_version >= "3.10" and python_version < "4.0"
28
+ click==8.1.8 ; python_version >= "3.10" and python_version < "4.0"
29
+ colorama==0.4.6 ; python_version >= "3.10" and python_version < "4.0"
30
+ comm==0.2.2 ; python_version >= "3.10" and python_version < "4.0"
31
+ crashtest==0.4.1 ; python_version >= "3.10" and python_version < "4.0"
32
+ cryptography==44.0.2 ; python_version >= "3.10" and python_version < "4.0" and sys_platform == "linux"
33
+ dataclasses-json==0.6.7 ; python_version >= "3.10" and python_version < "4.0"
34
+ debugpy==1.8.13 ; python_version >= "3.10" and python_version < "4.0"
35
+ decorator==5.2.1 ; python_version >= "3.10" and python_version < "4.0"
36
+ defusedxml==0.7.1 ; python_version >= "3.10" and python_version < "4.0"
37
+ deprecated==1.2.18 ; python_version >= "3.10" and python_version < "4.0"
38
+ dill==0.3.9 ; python_version >= "3.10" and python_version < "4.0"
39
+ dirtyjson==1.0.8 ; python_version >= "3.10" and python_version < "4.0"
40
+ distlib==0.3.9 ; python_version >= "3.10" and python_version < "4.0"
41
+ distro==1.9.0 ; python_version >= "3.10" and python_version < "4.0"
42
+ dulwich==0.21.7 ; python_version >= "3.10" and python_version < "4.0"
43
+ exceptiongroup==1.2.2 ; python_version == "3.10"
44
+ executing==2.2.0 ; python_version >= "3.10" and python_version < "4.0"
45
+ fastjsonschema==2.21.1 ; python_version >= "3.10" and python_version < "4.0"
46
+ filelock==3.18.0 ; python_version >= "3.10" and python_version < "4.0"
47
+ filetype==1.2.0 ; python_version >= "3.10" and python_version < "4.0"
48
+ fqdn==1.5.1 ; python_version >= "3.10" and python_version < "4.0"
49
+ frozenlist==1.5.0 ; python_version >= "3.10" and python_version < "4.0"
50
+ fsspec==2025.3.2 ; python_version >= "3.10" and python_version < "4.0"
51
+ gitdb==4.0.12 ; python_version >= "3.10" and python_version < "4.0"
52
+ gitpython==3.1.44 ; python_version >= "3.10" and python_version < "4.0"
53
+ greenlet==3.1.1 ; python_version >= "3.10" and python_version <= "3.13" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32")
54
+ griffe==1.7.2 ; python_version >= "3.10" and python_version < "4.0"
55
+ h11==0.14.0 ; python_version >= "3.10" and python_version < "4.0"
56
+ httpcore==1.0.7 ; python_version >= "3.10" and python_version < "4.0"
57
+ httpx-sse==0.4.0 ; python_version >= "3.10" and python_version < "4.0"
58
+ httpx==0.28.1 ; python_version >= "3.10" and python_version < "4.0"
59
+ huggingface-hub==0.30.1 ; python_version >= "3.10" and python_version < "4.0"
60
+ idna==3.10 ; python_version >= "3.10" and python_version < "4.0"
61
+ importlib-metadata==8.6.1 ; python_version >= "3.10" and python_version < "4.0"
62
+ importlib-resources==6.5.2 ; python_version >= "3.10" and python_version < "4.0"
63
+ installer==0.7.0 ; python_version >= "3.10" and python_version < "4.0"
64
+ ipykernel==6.29.5 ; python_version >= "3.10" and python_version < "4.0"
65
+ ipython-pygments-lexers==1.1.1 ; python_version >= "3.11" and python_version < "4.0"
66
+ ipython==8.34.0 ; python_version == "3.10"
67
+ ipython==9.0.2 ; python_version >= "3.11" and python_version < "4.0"
68
+ ipywidgets==8.1.5 ; python_version >= "3.10" and python_version < "4.0"
69
+ isoduration==20.11.0 ; python_version >= "3.10" and python_version < "4.0"
70
+ jaraco-classes==3.4.0 ; python_version >= "3.10" and python_version < "4.0"
71
+ jedi==0.19.2 ; python_version >= "3.10" and python_version < "4.0"
72
+ jeepney==0.9.0 ; python_version >= "3.10" and python_version < "4.0" and sys_platform == "linux"
73
+ jinja2==3.1.6 ; python_version >= "3.10" and python_version < "4.0"
74
+ jiter==0.9.0 ; python_version >= "3.10" and python_version < "4.0"
75
+ joblib==1.4.2 ; python_version >= "3.10" and python_version < "4.0"
76
+ json5==0.11.0 ; python_version >= "3.10" and python_version < "4.0"
77
+ jsonpatch==1.33 ; python_version >= "3.10" and python_version < "4.0"
78
+ jsonpointer==3.0.0 ; python_version >= "3.10" and python_version < "4.0"
79
+ jsonschema-specifications==2024.10.1 ; python_version >= "3.10" and python_version < "4.0"
80
+ jsonschema==4.23.0 ; python_version >= "3.10" and python_version < "4.0"
81
+ jupyter-client==8.6.3 ; python_version >= "3.10" and python_version < "4.0"
82
+ jupyter-console==6.6.3 ; python_version >= "3.10" and python_version < "4.0"
83
+ jupyter-core==5.7.2 ; python_version >= "3.10" and python_version < "4.0"
84
+ jupyter-events==0.12.0 ; python_version >= "3.10" and python_version < "4.0"
85
+ jupyter-lsp==2.2.5 ; python_version >= "3.10" and python_version < "4.0"
86
+ jupyter-server-terminals==0.5.3 ; python_version >= "3.10" and python_version < "4.0"
87
+ jupyter-server==2.15.0 ; python_version >= "3.10" and python_version < "4.0"
88
+ jupyter==1.1.1 ; python_version >= "3.10" and python_version < "4.0"
89
+ jupyterlab-pygments==0.3.0 ; python_version >= "3.10" and python_version < "4.0"
90
+ jupyterlab-server==2.27.3 ; python_version >= "3.10" and python_version < "4.0"
91
+ jupyterlab-widgets==3.0.13 ; python_version >= "3.10" and python_version < "4.0"
92
+ jupyterlab==4.3.6 ; python_version >= "3.10" and python_version < "4.0"
93
+ keyring==24.3.1 ; python_version >= "3.10" and python_version < "4.0"
94
+ langchain-community==0.3.21 ; python_version >= "3.10" and python_version < "4.0"
95
+ langchain-core==0.3.51 ; python_version >= "3.10" and python_version < "4.0"
96
+ langchain-text-splitters==0.3.8 ; python_version >= "3.10" and python_version < "4.0"
97
+ langchain==0.3.23 ; python_version >= "3.10" and python_version < "4.0"
98
+ langsmith==0.3.24 ; python_version >= "3.10" and python_version < "4.0"
99
+ llama-cloud-services==0.6.9 ; python_version >= "3.10" and python_version < "4.0"
100
+ llama-cloud==0.1.17 ; python_version >= "3.10" and python_version < "4.0"
101
+ llama-index-agent-openai==0.4.6 ; python_version >= "3.10" and python_version < "4.0"
102
+ llama-index-cli==0.4.1 ; python_version >= "3.10" and python_version < "4.0"
103
+ llama-index-core==0.12.28 ; python_version >= "3.10" and python_version < "4.0"
104
+ llama-index-embeddings-huggingface==0.5.2 ; python_version >= "3.10" and python_version < "4.0"
105
+ llama-index-embeddings-openai==0.3.1 ; python_version >= "3.10" and python_version < "4.0"
106
+ llama-index-indices-managed-llama-cloud==0.6.10 ; python_version >= "3.10" and python_version < "4.0"
107
+ llama-index-llms-openai==0.3.29 ; python_version >= "3.10" and python_version < "4.0"
108
+ llama-index-multi-modal-llms-openai==0.4.3 ; python_version >= "3.10" and python_version < "4.0"
109
+ llama-index-program-openai==0.3.1 ; python_version >= "3.10" and python_version < "4.0"
110
+ llama-index-question-gen-openai==0.3.0 ; python_version >= "3.10" and python_version < "4.0"
111
+ llama-index-readers-file==0.4.7 ; python_version >= "3.10" and python_version < "4.0"
112
+ llama-index-readers-llama-parse==0.4.0 ; python_version >= "3.10" and python_version < "4.0"
113
+ llama-index==0.12.28 ; python_version >= "3.10" and python_version < "4.0"
114
+ llama-parse==0.6.4.post1 ; python_version >= "3.10" and python_version < "4.0"
115
+ mako==1.3.9 ; python_version >= "3.10" and python_version < "4.0"
116
+ markdown-it-py==3.0.0 ; python_version >= "3.10" and python_version < "4.0"
117
+ markupsafe==3.0.2 ; python_version >= "3.10" and python_version < "4.0"
118
+ marshmallow==3.26.1 ; python_version >= "3.10" and python_version < "4.0"
119
+ matplotlib-inline==0.1.7 ; python_version >= "3.10" and python_version < "4.0"
120
+ mdurl==0.1.2 ; python_version >= "3.10" and python_version < "4.0"
121
+ mistune==3.1.3 ; python_version >= "3.10" and python_version < "4.0"
122
+ more-itertools==10.6.0 ; python_version >= "3.10" and python_version < "4.0"
123
+ mpmath==1.3.0 ; python_version >= "3.10" and python_version < "4.0"
124
+ msgpack==1.1.0 ; python_version >= "3.10" and python_version < "4.0"
125
+ multidict==6.3.1 ; python_version >= "3.10" and python_version < "4.0"
126
+ munch==2.5.0 ; python_version >= "3.10" and python_version < "4.0"
127
+ mypy-extensions==1.0.0 ; python_version >= "3.10" and python_version < "4.0"
128
+ narwhals==1.33.0 ; python_version >= "3.10" and python_version < "4.0"
129
+ nbclient==0.10.2 ; python_version >= "3.10" and python_version < "4.0"
130
+ nbconvert==7.16.6 ; python_version >= "3.10" and python_version < "4.0"
131
+ nbformat==5.10.4 ; python_version >= "3.10" and python_version < "4.0"
132
+ nest-asyncio==1.6.0 ; python_version >= "3.10" and python_version < "4.0"
133
+ networkx==3.4.2 ; python_version >= "3.10" and python_version < "4.0"
134
+ nltk==3.9.1 ; python_version >= "3.10" and python_version < "4.0"
135
+ notebook-shim==0.2.4 ; python_version >= "3.10" and python_version < "4.0"
136
+ notebook==7.3.3 ; python_version >= "3.10" and python_version < "4.0"
137
+ numpy==2.2.4 ; python_version >= "3.10" and python_version < "4.0"
138
+ nvidia-cublas-cu12==12.4.5.8 ; python_version >= "3.10" and python_version < "4.0" and platform_machine == "x86_64" and platform_system == "Linux"
139
+ nvidia-cuda-cupti-cu12==12.4.127 ; python_version >= "3.10" and python_version < "4.0" and platform_machine == "x86_64" and platform_system == "Linux"
140
+ nvidia-cuda-nvrtc-cu12==12.4.127 ; python_version >= "3.10" and python_version < "4.0" and platform_machine == "x86_64" and platform_system == "Linux"
141
+ nvidia-cuda-runtime-cu12==12.4.127 ; python_version >= "3.10" and python_version < "4.0" and platform_machine == "x86_64" and platform_system == "Linux"
142
+ nvidia-cudnn-cu12==9.1.0.70 ; python_version >= "3.10" and python_version < "4.0" and platform_machine == "x86_64" and platform_system == "Linux"
143
+ nvidia-cufft-cu12==11.2.1.3 ; python_version >= "3.10" and python_version < "4.0" and platform_machine == "x86_64" and platform_system == "Linux"
144
+ nvidia-curand-cu12==10.3.5.147 ; python_version >= "3.10" and python_version < "4.0" and platform_machine == "x86_64" and platform_system == "Linux"
145
+ nvidia-cusolver-cu12==11.6.1.9 ; python_version >= "3.10" and python_version < "4.0" and platform_machine == "x86_64" and platform_system == "Linux"
146
+ nvidia-cusparse-cu12==12.3.1.170 ; python_version >= "3.10" and python_version < "4.0" and platform_machine == "x86_64" and platform_system == "Linux"
147
+ nvidia-cusparselt-cu12==0.6.2 ; python_version >= "3.10" and python_version < "4.0" and platform_machine == "x86_64" and platform_system == "Linux"
148
+ nvidia-nccl-cu12==2.21.5 ; python_version >= "3.10" and python_version < "4.0" and platform_machine == "x86_64" and platform_system == "Linux"
149
+ nvidia-nvjitlink-cu12==12.4.127 ; python_version >= "3.10" and python_version < "4.0" and platform_machine == "x86_64" and platform_system == "Linux"
150
+ nvidia-nvtx-cu12==12.4.127 ; python_version >= "3.10" and python_version < "4.0" and platform_machine == "x86_64" and platform_system == "Linux"
151
+ openai==1.70.0 ; python_version >= "3.10" and python_version < "4.0"
152
+ opentelemetry-api==1.31.1 ; python_version >= "3.10" and python_version < "4.0"
153
+ opentelemetry-proto==1.31.1 ; python_version >= "3.10" and python_version < "4.0"
154
+ opentelemetry-sdk==1.31.1 ; python_version >= "3.10" and python_version < "4.0"
155
+ opentelemetry-semantic-conventions==0.52b1 ; python_version >= "3.10" and python_version < "4.0"
156
+ orjson==3.10.16 ; python_version >= "3.10" and python_version < "4.0" and platform_python_implementation != "PyPy"
157
+ overrides==7.7.0 ; python_version >= "3.10" and python_version < "4.0"
158
+ packaging==24.2 ; python_version >= "3.10" and python_version < "4.0"
159
+ pandas==2.2.3 ; python_version >= "3.10" and python_version < "4.0"
160
+ pandocfilters==1.5.1 ; python_version >= "3.10" and python_version < "4.0"
161
+ parso==0.8.4 ; python_version >= "3.10" and python_version < "4.0"
162
+ pexpect==4.9.0 ; python_version >= "3.10" and python_version < "4.0"
163
+ pillow==11.1.0 ; python_version >= "3.10" and python_version < "4.0"
164
+ pkginfo==1.12.1.2 ; python_version >= "3.10" and python_version < "4.0"
165
+ platformdirs==4.3.7 ; python_version >= "3.10" and python_version < "4.0"
166
+ plotly==5.24.1 ; python_version >= "3.10" and python_version < "4.0"
167
+ poetry-core==1.9.1 ; python_version >= "3.10" and python_version < "4.0"
168
+ poetry-plugin-export==1.8.0 ; python_version >= "3.10" and python_version < "4.0"
169
+ poetry==1.8.5 ; python_version >= "3.10" and python_version < "4.0"
170
+ prometheus-client==0.21.1 ; python_version >= "3.10" and python_version < "4.0"
171
+ prompt-toolkit==3.0.50 ; python_version >= "3.10" and python_version < "4.0"
172
+ propcache==0.3.1 ; python_version >= "3.10" and python_version < "4.0"
173
+ protobuf==5.29.4 ; python_version >= "3.10" and python_version < "4.0"
174
+ psutil==5.9.8 ; python_version >= "3.10" and python_version < "4.0"
175
+ ptyprocess==0.7.0 ; python_version >= "3.10" and python_version < "4.0"
176
+ pure-eval==0.2.3 ; python_version >= "3.10" and python_version < "4.0"
177
+ pyarrow==19.0.1 ; python_version >= "3.10" and python_version < "4.0"
178
+ pycparser==2.22 ; python_version >= "3.10" and python_version < "4.0"
179
+ pydantic-core==2.33.0 ; python_version >= "3.10" and python_version < "4.0"
180
+ pydantic-settings==2.8.1 ; python_version >= "3.10" and python_version < "4.0"
181
+ pydantic==2.11.1 ; python_version >= "3.10" and python_version < "4.0"
182
+ pydeck==0.9.1 ; python_version >= "3.10" and python_version < "4.0"
183
+ pygments==2.19.1 ; python_version >= "3.10" and python_version < "4.0"
184
+ pypdf==5.4.0 ; python_version >= "3.10" and python_version < "4.0"
185
+ pyproject-hooks==1.2.0 ; python_version >= "3.10" and python_version < "4.0"
186
+ python-dateutil==2.9.0.post0 ; python_version >= "3.10" and python_version < "4.0"
187
+ python-decouple==3.8 ; python_version >= "3.10" and python_version < "4.0"
188
+ python-dotenv==1.1.0 ; python_version >= "3.10" and python_version < "4.0"
189
+ python-json-logger==3.3.0 ; python_version >= "3.10" and python_version < "4.0"
190
+ pytz==2025.2 ; python_version >= "3.10" and python_version < "4.0"
191
+ pywin32-ctypes==0.2.3 ; python_version >= "3.10" and python_version < "4.0" and sys_platform == "win32"
192
+ pywin32==310 ; python_version >= "3.10" and python_version < "4.0" and platform_python_implementation != "PyPy" and sys_platform == "win32"
193
+ pywinpty==2.0.15 ; python_version >= "3.10" and python_version < "4.0" and os_name == "nt"
194
+ pyyaml==6.0.2 ; python_version >= "3.10" and python_version < "4.0"
195
+ pyzmq==26.3.0 ; python_version >= "3.10" and python_version < "4.0"
196
+ rapidfuzz==3.12.2 ; python_version >= "3.10" and python_version < "4.0"
197
+ referencing==0.36.2 ; python_version >= "3.10" and python_version < "4.0"
198
+ regex==2024.11.6 ; python_version >= "3.10" and python_version < "4.0"
199
+ requests-toolbelt==1.0.0 ; python_version >= "3.10" and python_version < "4.0"
200
+ requests==2.32.3 ; python_version >= "3.10" and python_version < "4.0"
201
+ rfc3339-validator==0.1.4 ; python_version >= "3.10" and python_version < "4.0"
202
+ rfc3986-validator==0.1.1 ; python_version >= "3.10" and python_version < "4.0"
203
+ rich==13.9.4 ; python_version >= "3.10" and python_version < "4.0"
204
+ rpds-py==0.24.0 ; python_version >= "3.10" and python_version < "4.0"
205
+ safetensors==0.5.3 ; python_version >= "3.10" and python_version < "4.0"
206
+ scikit-learn==1.6.1 ; python_version >= "3.10" and python_version < "4.0"
207
+ scipy==1.15.2 ; python_version >= "3.10" and python_version < "4.0"
208
+ secretstorage==3.3.3 ; python_version >= "3.10" and python_version < "4.0" and sys_platform == "linux"
209
+ send2trash==1.8.3 ; python_version >= "3.10" and python_version < "4.0"
210
+ sentence-transformers==4.0.2 ; python_version >= "3.10" and python_version < "4.0"
211
+ setuptools==78.1.0 ; python_version >= "3.10" and python_version < "4.0"
212
+ shellingham==1.5.4 ; python_version >= "3.10" and python_version < "4.0"
213
+ six==1.17.0 ; python_version >= "3.10" and python_version < "4.0"
214
+ smmap==5.0.2 ; python_version >= "3.10" and python_version < "4.0"
215
+ sniffio==1.3.1 ; python_version >= "3.10" and python_version < "4.0"
216
+ soupsieve==2.6 ; python_version >= "3.10" and python_version < "4.0"
217
+ sqlalchemy==2.0.40 ; python_version >= "3.10" and python_version < "4.0"
218
+ stack-data==0.6.3 ; python_version >= "3.10" and python_version < "4.0"
219
+ streamlit-aggrid==1.1.2 ; python_version >= "3.10" and python_version < "4.0"
220
+ streamlit==1.44.1 ; python_version >= "3.10" and python_version < "4.0"
221
+ striprtf==0.0.26 ; python_version >= "3.10" and python_version < "4.0"
222
+ sympy==1.13.1 ; python_version >= "3.10" and python_version < "4.0"
223
+ tenacity==9.1.2 ; python_version >= "3.10" and python_version < "4.0"
224
+ terminado==0.18.1 ; python_version >= "3.10" and python_version < "4.0"
225
+ threadpoolctl==3.6.0 ; python_version >= "3.10" and python_version < "4.0"
226
+ tiktoken==0.9.0 ; python_version >= "3.10" and python_version < "4.0"
227
+ tinycss2==1.4.0 ; python_version >= "3.10" and python_version < "4.0"
228
+ tokenizers==0.21.1 ; python_version >= "3.10" and python_version < "4.0"
229
+ toml==0.10.2 ; python_version >= "3.10" and python_version < "4.0"
230
+ tomli==2.2.1 ; python_version == "3.10"
231
+ tomlkit==0.13.2 ; python_version >= "3.10" and python_version < "4.0"
232
+ torch==2.6.0 ; python_version >= "3.10" and python_version < "4.0"
233
+ tornado==6.4.2 ; python_version >= "3.10" and python_version < "4.0"
234
+ tqdm==4.67.1 ; python_version >= "3.10" and python_version < "4.0"
235
+ traitlets==5.14.3 ; python_version >= "3.10" and python_version < "4.0"
236
+ transformers==4.50.3 ; python_version >= "3.10" and python_version < "4.0"
237
+ triton==3.2.0 ; python_version >= "3.10" and python_version < "4.0" and platform_machine == "x86_64" and platform_system == "Linux"
238
+ trove-classifiers==2025.3.19.19 ; python_version >= "3.10" and python_version < "4.0"
239
+ trulens-apps-langchain==1.4.8 ; python_version >= "3.10" and python_version < "4.0"
240
+ trulens-apps-llamaindex==1.4.8 ; python_version >= "3.10" and python_version < "4.0"
241
+ trulens-core==1.4.7 ; python_version >= "3.10" and python_version < "4.0"
242
+ trulens-dashboard==1.4.7 ; python_version >= "3.10" and python_version < "4.0"
243
+ trulens-eval==1.4.7 ; python_version >= "3.10" and python_version < "4.0"
244
+ trulens-feedback==1.4.7 ; python_version >= "3.10" and python_version < "4.0"
245
+ trulens-otel-semconv==1.4.7 ; python_version >= "3.10" and python_version < "4.0"
246
+ trulens-providers-openai==1.4.8 ; python_version >= "3.10" and python_version < "4.0"
247
+ trulens==1.4.7 ; python_version >= "3.10" and python_version < "4.0"
248
+ types-python-dateutil==2.9.0.20241206 ; python_version >= "3.10" and python_version < "4.0"
249
+ typing-extensions==4.13.0 ; python_version >= "3.10" and python_version < "4.0"
250
+ typing-inspect==0.9.0 ; python_version >= "3.10" and python_version < "4.0"
251
+ typing-inspection==0.4.0 ; python_version >= "3.10" and python_version < "4.0"
252
+ tzdata==2025.2 ; python_version >= "3.10" and python_version < "4.0"
253
+ uri-template==1.3.0 ; python_version >= "3.10" and python_version < "4.0"
254
+ urllib3==2.3.0 ; python_version >= "3.10" and python_version < "4.0"
255
+ virtualenv==20.30.0 ; python_version >= "3.10" and python_version < "4.0"
256
+ watchdog==6.0.0 ; python_version >= "3.10" and python_version < "4.0" and platform_system != "Darwin"
257
+ wcwidth==0.2.13 ; python_version >= "3.10" and python_version < "4.0"
258
+ webcolors==24.11.1 ; python_version >= "3.10" and python_version < "4.0"
259
+ webencodings==0.5.1 ; python_version >= "3.10" and python_version < "4.0"
260
+ websocket-client==1.8.0 ; python_version >= "3.10" and python_version < "4.0"
261
+ widgetsnbextension==4.0.13 ; python_version >= "3.10" and python_version < "4.0"
262
+ wrapt==1.17.2 ; python_version >= "3.10" and python_version < "4.0"
263
+ xattr==1.1.4 ; python_version >= "3.10" and python_version < "4.0" and sys_platform == "darwin"
264
+ yarl==1.18.3 ; python_version >= "3.10" and python_version < "4.0"
265
+ zipp==3.21.0 ; python_version >= "3.10" and python_version < "4.0"
266
+ zstandard==0.23.0 ; python_version >= "3.10" and python_version < "4.0"
src/mythesis_chatbot/rag_setup.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from typing import Literal
4
+
5
+ import openai
6
+ from llama_index.core import (
7
+ Document,
8
+ Settings,
9
+ SimpleDirectoryReader,
10
+ StorageContext,
11
+ VectorStoreIndex,
12
+ load_index_from_storage,
13
+ )
14
+ from llama_index.core.node_parser import (
15
+ HierarchicalNodeParser,
16
+ SentenceWindowNodeParser,
17
+ get_leaf_nodes,
18
+ )
19
+ from llama_index.core.postprocessor import (
20
+ MetadataReplacementPostProcessor,
21
+ SentenceTransformerRerank,
22
+ )
23
+ from llama_index.core.query_engine import RetrieverQueryEngine
24
+ from llama_index.core.retrievers import AutoMergingRetriever
25
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
26
+ from llama_index.llms.openai import OpenAI
27
+
28
+ from mythesis_chatbot.utils import get_config_hash, get_openai_api_key
29
+
30
+ SupportedRags = Literal["basic", "sentence window retrieval", "auto-merging retrieval"]
31
+ SupportedOpenAIllms = Literal["gpt-4o-mini", "gpt-3.5-turbo"]
32
+ SupportedEmbedModels = Literal["BAAI/bge-small-en-v1.5"]
33
+ SupportedRerankModels = Literal["cross-encoder/ms-marco-MiniLM-L-2-v2"]
34
+
35
+
36
def load_data(input_file: str) -> Document:
    """Read a file and merge everything the reader returns into one document.

    Parameters
    ----------
    input_file : str
        Path of the file handed to ``SimpleDirectoryReader``.

    Returns
    -------
    Document
        A single ``Document`` whose text is the text of every loaded part,
        joined with a blank line between consecutive parts.
    """
    # The reader returns a list of Document objects (one object per page).
    pages = SimpleDirectoryReader(input_files=[input_file]).load_data()
    merged_text = "\n\n".join(page.text for page in pages)
    return Document(text=merged_text)
44
+
45
+
46
def build_sentence_window_index(
    input_file: str,
    save_dir: str,
    index_config: dict[str, str | int],
):
    """Build the sentence-window index, or load it if it was already persisted.

    The index lives in ``<save_dir>/sentence_window/<hash of index_config>``.
    When that directory exists the index is loaded from it; otherwise it is
    built from ``input_file``, persisted there, and ``index_config`` is dumped
    next to it as ``meta.json``.

    Parameters
    ----------
    input_file : str
        Document to index (only read when the index has to be built).
    save_dir : str
        Root directory under which indexes are persisted.
    index_config : dict
        Must provide ``"embed_model"`` and ``"sentence_window_size"``; the whole
        mapping is hashed to name the persist directory.

    Returns
    -------
    The freshly built or reloaded index.

    Notes
    -----
    Mutates the global ``Settings``: ``embed_model`` is always set, and
    ``node_parser`` is set when the index is built.
    """
    index_dir = os.path.join(
        save_dir, "sentence_window", get_config_hash(index_config)
    )

    # Set on both paths (build and reload), before the directory is checked.
    Settings.embed_model = HuggingFaceEmbedding(model_name=index_config["embed_model"])

    if os.path.exists(index_dir):
        return load_index_from_storage(
            StorageContext.from_defaults(persist_dir=index_dir)
        )

    document = load_data(input_file)

    # A node is a chunk of text. Each node produced by the sentence window
    # parser also carries its context (the closest chunks of text) as metadata,
    # stored under the "window" key.
    Settings.node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=index_config["sentence_window_size"],
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )

    sentence_index = VectorStoreIndex.from_documents([document])
    sentence_index.storage_context.persist(persist_dir=index_dir)

    # Keep the human-readable configuration alongside the hashed directory.
    meta_path = os.path.join(index_dir, "meta.json")
    with open(meta_path, "w") as f:
        json.dump(index_config, f, indent=2)

    return sentence_index
82
+
83
+
84
def build_automerging_index(
    input_file: str,
    save_dir: str,
    index_config: dict[str, str | list[int]],
):
    """Build the auto-merging index, or load it if it was already persisted.

    The index lives in ``<save_dir>/auto_merging/<hash of index_config>``.
    When that directory exists the index is loaded from it; otherwise it is
    built from ``input_file``, persisted there, and ``index_config`` is dumped
    next to it as ``meta.json``.

    Parameters
    ----------
    input_file : str
        Document to index (only read when the index has to be built).
    save_dir : str
        Root directory under which indexes are persisted.
    index_config : dict
        Must provide ``"embed_model"`` and ``"chunk_sizes"``; the whole mapping
        is hashed to name the persist directory.

    Returns
    -------
    The freshly built or reloaded index.

    Notes
    -----
    Mutates the global ``Settings``: ``embed_model`` is always set, and
    ``node_parser`` is set when the index is built.
    """
    index_dir = os.path.join(save_dir, "auto_merging", get_config_hash(index_config))

    # Set on both paths (build and reload), before the directory is checked.
    Settings.embed_model = HuggingFaceEmbedding(model_name=index_config["embed_model"])

    if os.path.exists(index_dir):
        return load_index_from_storage(
            StorageContext.from_defaults(persist_dir=index_dir),
        )

    document = load_data(input_file)
    hierarchy_parser = HierarchicalNodeParser.from_defaults(
        chunk_sizes=index_config["chunk_sizes"]
    )
    all_nodes = hierarchy_parser.get_nodes_from_documents([document])
    leaves = get_leaf_nodes(all_nodes)

    Settings.node_parser = hierarchy_parser

    # Every node goes into the docstore; only the leaves are passed to the
    # vector index.
    context = StorageContext.from_defaults()
    context.docstore.add_documents(all_nodes)

    automerging_index = VectorStoreIndex(leaves, storage_context=context)
    automerging_index.storage_context.persist(persist_dir=index_dir)

    # Keep the human-readable configuration alongside the hashed directory.
    meta_path = os.path.join(index_dir, "meta.json")
    with open(meta_path, "w") as f:
        json.dump(index_config, f, indent=2)

    return automerging_index
122
+
123
+
124
def get_sentence_window_query_engine(
    sentence_index,
    similarity_top_k: int = 6,
    rerank_top_n: int = 2,
    rerank_model: str = "cross-encoder/ms-marco-MiniLM-L-2-v2",
):
    """Create a query engine over a sentence-window index.

    Parameters
    ----------
    sentence_index
        Index exposing ``as_query_engine`` (e.g. the result of
        ``build_sentence_window_index``).
    similarity_top_k : int
        Forwarded to ``as_query_engine`` as ``similarity_top_k``.
    rerank_top_n : int
        ``top_n`` given to the reranker.
    rerank_model : str
        Model name given to the reranker.

    Returns
    -------
    The query engine, configured with two node post-processors applied in
    order: metadata replacement, then reranking.
    """
    node_postprocessors = [
        # Replaces the node content with the "window" field of its metadata.
        MetadataReplacementPostProcessor(target_metadata_key="window"),
        # Reranking prunes irrelevant nodes from the context, which can speed
        # up the LLM query without sacrificing accuracy.
        SentenceTransformerRerank(top_n=rerank_top_n, model=rerank_model),
    ]
    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=node_postprocessors,
    )
141
+
142
+
143
def get_automerging_query_engine(
    automerging_index,
    similarity_top_k: int = 12,
    rerank_top_n: int = 6,
    rerank_model: str = "cross-encoder/ms-marco-MiniLM-L-2-v2",
):
    """Create a query engine that retrieves through an ``AutoMergingRetriever``.

    Parameters
    ----------
    automerging_index
        Index exposing ``as_retriever`` and ``storage_context`` (e.g. the result
        of ``build_automerging_index``).
    similarity_top_k : int
        Forwarded to the index's base retriever as ``similarity_top_k``.
    rerank_top_n : int
        ``top_n`` given to the reranker.
    rerank_model : str
        Model name given to the reranker.

    Returns
    -------
    A ``RetrieverQueryEngine`` using the auto-merging retriever, with the
    reranker as its only node post-processor.
    """
    leaf_retriever = automerging_index.as_retriever(similarity_top_k=similarity_top_k)
    # The auto-merging retriever wraps the base retriever and is given the
    # index's storage context; verbose output is always enabled here.
    merging_retriever = AutoMergingRetriever(
        leaf_retriever, automerging_index.storage_context, verbose=True
    )
    reranker = SentenceTransformerRerank(top_n=rerank_top_n, model=rerank_model)
    return RetrieverQueryEngine.from_args(
        merging_retriever, node_postprocessors=[reranker]
    )
158
+
159
+
160
def sentence_window_retrieval_setup(
    input_file: str,
    save_dir: str,
    llm_openai_model: SupportedOpenAIllms = "gpt-4o-mini",
    temperature: float = 0.1,
    embed_model: SupportedEmbedModels = "BAAI/bge-small-en-v1.5",
    sentence_window_size: int = 3,
    similarity_top_k: int = 6,
    rerank_model: SupportedRerankModels = "cross-encoder/ms-marco-MiniLM-L-2-v2",
    rerank_top_n: int = 2,
):
    """Set up the complete sentence-window retrieval pipeline.

    Parameters
    ----------
    input_file : str
        Document to index.
    save_dir : str
        Root directory under which the index is persisted / looked up.
    llm_openai_model : str
        OpenAI model used as the global LLM.
    temperature : float
        Temperature passed to the LLM.
    embed_model : str
        Embedding model name; part of the index identity.
    sentence_window_size : int
        Window size for the node parser; part of the index identity.
    similarity_top_k : int
        Forwarded to the query engine as ``similarity_top_k``.
    rerank_model : str
        Model name given to the reranker.
    rerank_top_n : int
        ``top_n`` given to the reranker.

    Returns
    -------
    The query engine returned by ``get_sentence_window_query_engine``.

    Raises
    ------
    ValueError
        Propagated from ``get_openai_api_key`` when no API key is configured.

    Notes
    -----
    Sets ``openai.api_key`` and ``Settings.llm`` as module-global side effects.
    """
    openai.api_key = get_openai_api_key()

    # Only the parameters stored here identify the persisted index; the file
    # is recorded by base name only.
    index_identity = {
        "doc_source": os.path.basename(input_file),
        "embed_model": embed_model,
        "sentence_window_size": sentence_window_size,
    }

    # Step 1: build the index, or reload a previously persisted one.
    index = build_sentence_window_index(input_file, save_dir, index_identity)

    Settings.llm = OpenAI(model=llm_openai_model, temperature=temperature)

    # Step 2: wrap the index in a query engine.
    return get_sentence_window_query_engine(
        index,
        similarity_top_k=similarity_top_k,
        rerank_model=rerank_model,
        rerank_top_n=rerank_top_n,
    )
195
+
196
+
197
def automerging_retrieval_setup(
    input_file: str,
    save_dir: str,
    llm_openai_model: SupportedOpenAIllms = "gpt-4o-mini",
    temperature: float = 0.1,
    embed_model: SupportedEmbedModels = "BAAI/bge-small-en-v1.5",
    chunk_sizes: list[int] | None = None,
    similarity_top_k: int = 6,
    rerank_model: SupportedRerankModels = "cross-encoder/ms-marco-MiniLM-L-2-v2",
    rerank_top_n: int = 2,
):
    """Set up the complete auto-merging retrieval pipeline.

    Parameters
    ----------
    input_file : str
        Document to index.
    save_dir : str
        Root directory under which the index is persisted / looked up.
    llm_openai_model : str
        OpenAI model used as the global LLM.
    temperature : float
        Temperature passed to the LLM.
    embed_model : str
        Embedding model name; part of the index identity.
    chunk_sizes : list of int, optional
        Chunk sizes handed to the hierarchical node parser; part of the index
        identity. Defaults to ``[2048, 512, 128]`` when omitted or ``None``.
    similarity_top_k : int
        Forwarded to the base retriever as ``similarity_top_k``.
    rerank_model : str
        Model name given to the reranker.
    rerank_top_n : int
        ``top_n`` given to the reranker.

    Returns
    -------
    The query engine returned by ``get_automerging_query_engine``.

    Raises
    ------
    ValueError
        Propagated from ``get_openai_api_key`` when no API key is configured.

    Notes
    -----
    Sets ``openai.api_key`` and ``Settings.llm`` as module-global side effects.
    """
    # A list literal as a default argument is shared between calls; build a
    # fresh list per call instead (a copy is taken of caller-supplied values).
    chunk_sizes = [2048, 512, 128] if chunk_sizes is None else list(chunk_sizes)

    openai.api_key = get_openai_api_key()

    # Only the parameters stored here identify the persisted index; the file
    # is recorded by base name only.
    config = {
        "doc_source": os.path.basename(input_file),
        "embed_model": embed_model,
        "chunk_sizes": chunk_sizes,
    }

    # 1. Build index (or reload a previously persisted one)
    index = build_automerging_index(input_file, save_dir, config)

    Settings.llm = OpenAI(model=llm_openai_model, temperature=temperature)

    # 2. Get engine. This must be the auto-merging engine: the sentence-window
    # engine never wraps the retriever in AutoMergingRetriever, and it replaces
    # node content from a "window" metadata key that the hierarchical parser
    # used for this index is not configured to write.
    automerging_engine = get_automerging_query_engine(
        index,
        similarity_top_k=similarity_top_k,
        rerank_model=rerank_model,
        rerank_top_n=rerank_top_n,
    )

    return automerging_engine
231
+
232
+
233
def basic_rag_setup(
    input_file: str,
    save_dir: str,
    llm_openai_model: SupportedOpenAIllms = "gpt-4o-mini",
    temperature: float = 0.1,
    embed_model: SupportedEmbedModels = "BAAI/bge-small-en-v1.5",
    similarity_top_k: int = 6,
    rerank_model: SupportedRerankModels = "cross-encoder/ms-marco-MiniLM-L-2-v2",
    rerank_top_n: int = 2,
):
    """Set up the basic RAG pipeline: a plain vector index plus a reranker.

    Parameters
    ----------
    input_file : str
        Document to index (only read when the index has to be built).
    save_dir : str
        Root directory; the index is persisted in its ``basic`` sub-directory.
    llm_openai_model : str
        OpenAI model passed to the query engine.
    temperature : float
        Temperature passed to the LLM.
    embed_model : str
        Embedding model name, installed as the global embedding model.
    similarity_top_k : int
        Forwarded to the query engine as ``similarity_top_k``.
    rerank_model : str
        Model name given to the reranker.
    rerank_top_n : int
        ``top_n`` given to the reranker.

    Returns
    -------
    The query engine created by ``index.as_query_engine``.

    Raises
    ------
    ValueError
        Propagated from ``get_openai_api_key`` when no API key is configured.

    Notes
    -----
    Sets ``openai.api_key`` and ``Settings.embed_model`` as global side effects.
    """
    openai.api_key = get_openai_api_key()

    Settings.embed_model = HuggingFaceEmbedding(model_name=embed_model)

    # NOTE(review): unlike the other index builders in this module, this
    # directory is not keyed by a configuration hash, so an index persisted for
    # another `input_file` or `embed_model` would be reloaded as-is - confirm
    # this is intended.
    index_dir = os.path.join(save_dir, "basic")
    if os.path.exists(index_dir):
        index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=index_dir)
        )
    else:
        index = VectorStoreIndex.from_documents([load_data(input_file)])
        index.storage_context.persist(persist_dir=index_dir)

    reranker = SentenceTransformerRerank(top_n=rerank_top_n, model=rerank_model)

    # The LLM is passed to the engine directly; Settings.llm is not touched.
    return index.as_query_engine(
        llm=OpenAI(model=llm_openai_model, temperature=temperature),
        similarity_top_k=similarity_top_k,
        node_postprocessors=[reranker],
    )
src/mythesis_chatbot/utils.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import json
3
+ import os
4
+
5
+
6
def get_config_hash(config: dict, length: int = 10) -> str:
    """Return a short, deterministic hash identifying a configuration.

    The configuration is serialized to JSON with sorted keys, so two dicts with
    the same content hash identically regardless of key insertion order.

    Parameters
    ----------
    config : dict
        JSON-serializable configuration.
    length : int, optional
        Number of leading hex characters of the SHA-256 digest to keep
        (default 10). Values above 64 yield the full 64-character digest.

    Returns
    -------
    str
        The truncated hexadecimal SHA-256 digest.

    Raises
    ------
    ValueError
        If ``length`` is smaller than 1 (every configuration would collide).
    TypeError
        Raised by ``json.dumps`` if ``config`` is not JSON-serializable.
    """
    if length < 1:
        raise ValueError("length must be a positive integer")

    # Use JSON to serialize and sort keys for deterministic output
    config_str = json.dumps(config, sort_keys=True)

    return hashlib.sha256(config_str.encode()).hexdigest()[:length]  # short hash
11
+
12
+
13
def get_openai_api_key() -> str:
    """
    Get the OpenAI API key from an environment variable.

    Returns
    -------
    str
        The value of ``OPENAI_API_KEY`` with surrounding whitespace removed.

    Raises
    ------
    ValueError
        If ``OPENAI_API_KEY`` is unset, empty, or contains only whitespace.
    """
    # Strip before validating: a value copied with a trailing newline or made
    # only of blanks would otherwise be returned as if it were a usable key.
    api_key = os.getenv("OPENAI_API_KEY", "").strip()
    if api_key:
        return api_key

    raise ValueError(
        "OpenAI API key not found. Please follow the instruction in the readme file."
    )