Linhz committed on
Commit 78cfd3f · verified · 1 Parent(s): 272b50a

Upload 10 files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+sentence_embeddings_index_no_citation.faiss filter=lfs diff=lfs merge=lfs -text
.streamlit/secrets.toml ADDED
@@ -0,0 +1 @@
+OPENAI_API_KEY = "sk-REDACTED"
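In the app the key would normally be read through Streamlit's secrets API rather than pasted into code. A minimal sketch of how this file is consumed, assuming the otherwise-unused OpenAI client is eventually wired to it:

```python
import streamlit as st
from openai import OpenAI

# st.secrets transparently loads .streamlit/secrets.toml;
# the key name matches the entry committed above.
client = OpenAI(api_key=st.secrets["OPENAI_API_KEY"])
```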
App3.py ADDED
@@ -0,0 +1,61 @@
+from openai import OpenAI  # imported for the key in secrets.toml; not used in this version
+import streamlit as st
+import faiss
+from sentence_transformers import SentenceTransformer
+import pickle
+import re
+from transformers import pipeline
+
+st.title("Vietnamese Legal Question Answering System")
+
+# Corpus of law articles; row i of the FAISS index corresponds to articles[i].
+with open('articles.pkl', 'rb') as file:
+    articles = pickle.load(file)
+
+index_loaded = faiss.read_index("sentence_embeddings_index_no_citation.faiss")
+
+# Keep the bi-encoder in session state so it is loaded only once per session.
+if 'model_embedding' not in st.session_state:
+    st.session_state.model_embedding = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')
+
+# Replace this with your own checkpoint
+model_checkpoint = "model"
+question_answerer = pipeline("question-answering", model=model_checkpoint)
+
+def question_answering(question):
+    # Embed the query and retrieve the k nearest law articles.
+    query_sentence = [question]
+    query_embedding = st.session_state.model_embedding.encode(query_sentence)
+    k = 5
+    D, I = index_loaded.search(query_embedding.astype('float32'), k)  # D is distances, I is indices
+    # Run extractive QA over each retrieved article and keep the highest-scoring span.
+    answers = [question_answerer(question=query_sentence[0], context=articles[I[0][i]], max_answer_len=512) for i in range(k)]
+    best_answer = max(answers, key=lambda x: x['score'])
+    if best_answer['score'] > 0.5:
+        return best_answer['answer']
+    # Low confidence: "I'm not quite sure, but perhaps the answer is: ..."
+    return f"Tôi không chắc lắm nhưng có lẽ câu trả lời là: {best_answer['answer']}"
+
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+
+# Replay the chat history on each Streamlit rerun.
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+
+def clean_answer(s):
+    # Strip trailing punctuation and whitespace; \W is Unicode-aware,
+    # so Vietnamese letters at the end of the answer are kept.
+    return re.sub(r'[\W_]+$', '', s)
+
+if prompt := st.chat_input("What is up?"):
+    st.session_state.messages.append({"role": "user", "content": prompt})
+    with st.chat_message("user"):
+        st.markdown(prompt)
+    response = clean_answer(question_answering(prompt))
+    with st.chat_message("assistant"):
+        st.markdown(response)
+
+    st.session_state.messages.append({"role": "assistant", "content": response})
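The commit ships `articles.pkl` and the FAISS index as finished artifacts; the build script is not included. A minimal sketch of how they were presumably produced, assuming an exact L2 index (the app only calls `index.search`, which any FAISS index type supports):

```python
import pickle
import faiss
from sentence_transformers import SentenceTransformer

# Hypothetical corpus; the real list holds the Vietnamese law articles.
articles = ["Điều 1. Phạm vi điều chỉnh ...", "Điều 2. Đối tượng áp dụng ..."]

model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')
embeddings = model.encode(articles, convert_to_numpy=True).astype('float32')

index = faiss.IndexFlatL2(embeddings.shape[1])  # assumed: exact L2 search
index.add(embeddings)

faiss.write_index(index, "sentence_embeddings_index_no_citation.faiss")
with open("articles.pkl", "wb") as f:
    pickle.dump(articles, f)  # row i of the index maps to articles[i]
```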
articles.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ab327307b8445059e2651ec102473c81b909f393fe0e7ab3b08fa54574cbef3
+size 112645333
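Files tracked by Git LFS are committed as pointers like the one above; the blob itself is fetched on `git lfs pull`. If in doubt, the downloaded file can be checked against the pointer's oid:

```python
import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # Stream the file so the ~112 MB pickle is not read into memory at once.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()

assert sha256_of("articles.pkl") == (
    "0ab327307b8445059e2651ec102473c81b909f393fe0e7ab3b08fa54574cbef3"
)
```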
model/config.json ADDED
@@ -0,0 +1,29 @@
+{
+  "_name_or_path": "nguyenvulebinh/vi-mrc-large",
+  "architectures": [
+    "RobertaForQuestionAnswering"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.1",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}
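The config describes a `RobertaForQuestionAnswering` head fine-tuned from `nguyenvulebinh/vi-mrc-large`, with XLM-R-large dimensions (24 layers, hidden size 1024, 250002-token vocabulary). A sketch of loading the checkpoint directory explicitly, equivalent to the `pipeline(...)` call in App3.py:

```python
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("model")
model = AutoModelForQuestionAnswering.from_pretrained("model")

print(model.config.num_hidden_layers, model.config.hidden_size)  # 24 1024
```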
model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c754249d7b05fac9b7f4fa8e9655c331e2a39e1d250163a71f12e939f2ac1725
+size 2235510893
model/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}
model/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63c73edbc2edd6c83927ce77f0f51131ae759bae511b7b1a0258cb8161f56dd0
+size 17082924
model/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 512,
+  "name_or_path": "nguyenvulebinh/vi-mrc-large",
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "special_tokens_map_file": null,
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}
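`model_max_length` is 512, so a question plus a long law article must be truncated before it reaches the model (the question-answering pipeline does this internally). A small sketch with a hypothetical question/context pair:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("model")

encoded = tokenizer(
    "Mức phạt khi vượt đèn đỏ là bao nhiêu?",  # hypothetical question
    "Điều 5. ...",                             # hypothetical article text
    truncation="only_second",  # trim the context, never the question
    max_length=512,
)
print(len(encoded["input_ids"]))  # <= 512
```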
model/training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ba966667f50e05cba202f5926f0a98d9d1bf217623435eaf15f0b00cd7cfc44
+size 3515
sentence_embeddings_index_no_citation.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75ad281e6fdb666da835ef1163bff941bc21344cafec6809f58e823e4ac9dc8b
+size 199836717
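After `git lfs pull`, a quick sanity check is that the index dimensionality matches the bi-encoder's output size (768 for the PhoBERT-based `bkai-foundation-models/vietnamese-bi-encoder`, an assumption here) and that `ntotal` equals `len(articles)`:

```python
import faiss

index = faiss.read_index("sentence_embeddings_index_no_citation.faiss")
print(index.d, index.ntotal)  # expected: 768, one vector per law article
```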