import os

import gradio as gr
import torch
from transformers import AutoTokenizer

from tiny_finbert import TinyFinBERTRegressor, preprocess_texts

MODEL_DIR = "./saved_model"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and fine-tuned regressor weights are loaded once at startup.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

model = TinyFinBERTRegressor().to(DEVICE)
model.load_state_dict(
    torch.load(os.path.join(MODEL_DIR, "regressor_model.pt"), map_location=DEVICE)
)
model.eval()


def predict_sentiment(text):
    """Score a financial sentence with TinyFinBERT.

    Args:
        text: Raw input sentence from the UI textbox.

    Returns:
        A 2-tuple ``(score, interpretation)`` where ``score`` is the raw
        regression output rounded to 4 decimals and ``interpretation`` is
        "positive" (> 0.3), "negative" (< -0.3) or "neutral" otherwise.
    """
    processed = preprocess_texts([text])[0]
    inputs = tokenizer(
        processed,
        return_tensors="pt",
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    # TinyFinBERTRegressor.forward does not accept token_type_ids, so drop them.
    inputs = {k: v.to(DEVICE) for k, v in inputs.items() if k != "token_type_ids"}
    with torch.no_grad():
        score = model(**inputs)["score"].item()

    if score > 0.3:
        interpretation = "positive"
    elif score < -0.3:
        interpretation = "negative"
    else:
        interpretation = "neutral"
    # BUG FIX: the Interface below declares TWO output components, so the
    # function must return a 2-tuple — the original returned a single dict,
    # which Gradio cannot map onto [Number, Textbox].
    return round(score, 4), interpretation


iface = gr.Interface(
    fn=predict_sentiment,
    inputs=gr.Textbox(label="Enter financial sentence"),
    outputs=[
        gr.Number(label="Sentiment Score"),
        gr.Textbox(label="Interpretation"),
    ],
    title="TinyFinBERT Sentiment Analysis",
)

iface.launch()
0000000000000000000000000000000000000000..094c3e7372278abe459a14597dbcc7fc52b4cbc1 --- /dev/null +++ b/saved_model/regressor_model.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50353c6a0c9b2f79c90d554c89d132901f45301ede1e293d2484c86a5b02175d +size 57430528 diff --git a/saved_model/results/checkpoint-1378/optimizer.pt b/saved_model/results/checkpoint-1378/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2ff6b7ed6c2775c24cc448b66285ad64b4a7ad3 --- /dev/null +++ b/saved_model/results/checkpoint-1378/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f03a9de2881a1aa26e9819fcb5f4a4a18e24cc70117f96613455c179d004170 +size 114061259 diff --git a/saved_model/results/checkpoint-1378/pytorch_model.bin b/saved_model/results/checkpoint-1378/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..62ecc24a0bcdeb0c28e551be2ea4bb85d1c6e3f9 --- /dev/null +++ b/saved_model/results/checkpoint-1378/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8112307195840f5c66c27f61c3dcc7973026d9204aab43009de991b98c4e8cca +size 57430370 diff --git a/saved_model/results/checkpoint-1378/rng_state.pth b/saved_model/results/checkpoint-1378/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a71c0df81bfc6eabcace6d8ce1fa68ec8fb776d2 --- /dev/null +++ b/saved_model/results/checkpoint-1378/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d4f5b079295aad1d803fe541076c455d9c83ab7bb784cae20338d7514661e4c +size 14455 diff --git a/saved_model/results/checkpoint-1378/scheduler.pt b/saved_model/results/checkpoint-1378/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb9405adfb0a7713afdbe823e2ee77ed3ecc019d --- /dev/null +++ b/saved_model/results/checkpoint-1378/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a2ef41ef574dc1cd1bbcb4157a461b97dbe088d54c5182b29e445da15823fdd0 +size 1465 diff --git a/saved_model/results/checkpoint-1378/special_tokens_map.json b/saved_model/results/checkpoint-1378/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f018c31f8fbacc2583f66c4f9d486c3987e18720 --- /dev/null +++ b/saved_model/results/checkpoint-1378/special_tokens_map.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c3507f36dff57bce437223db3b3081d1e2b52ec3e56ee55438193ecb2c94dd6 +size 132 diff --git a/saved_model/results/checkpoint-1378/tokenizer.json b/saved_model/results/checkpoint-1378/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..fdeda05537d802dc152c5710d90fe4f9d39a35f8 --- /dev/null +++ b/saved_model/results/checkpoint-1378/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0e79933b9ed51798a3ae27893d3c5fa4a201126cef75586296df9b4d2c62a0 +size 711661 diff --git a/saved_model/results/checkpoint-1378/tokenizer_config.json b/saved_model/results/checkpoint-1378/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d79f110f85a86b2429972e257c10f51a2fff4677 --- /dev/null +++ b/saved_model/results/checkpoint-1378/tokenizer_config.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e86d243cbe5f8327d7eb8dda2415dce752b0e18da1a1b5e417a856647a06a3c2 +size 409 diff --git a/saved_model/results/checkpoint-1378/trainer_state.json b/saved_model/results/checkpoint-1378/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fc3dd5c36cbf4231eb2044f5037973ca15aab366 --- /dev/null +++ b/saved_model/results/checkpoint-1378/trainer_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2758e807ae7a58b3e40a56ad066e5b5c68e16091559c13d31bd251653357288 +size 1231 diff --git a/saved_model/results/checkpoint-1378/training_args.bin 
b/saved_model/results/checkpoint-1378/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..983a7d59005b1a8c52b27ba4479a2d41c4415720 --- /dev/null +++ b/saved_model/results/checkpoint-1378/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fd3afd0bd610815f9b7237ec63bc18799fa4677567d19f3a2cf1d051aff770e +size 4817 diff --git a/saved_model/results/checkpoint-1378/vocab.txt b/saved_model/results/checkpoint-1378/vocab.txt new file mode 100644 index 0000000000000000000000000000000000000000..cada3e340cf0ce35daff62c70d0fb6b705c64dea --- /dev/null +++ b/saved_model/results/checkpoint-1378/vocab.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3 +size 231508 diff --git a/saved_model/results/checkpoint-2067/optimizer.pt b/saved_model/results/checkpoint-2067/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ee7dbd9325fef8d9b78f8ae91f6a1a13815e6d9 --- /dev/null +++ b/saved_model/results/checkpoint-2067/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1417eb6c2dc1a5e29647dd0bd9f71b11300b264876bf0cb9166fc6ac00af7a6f +size 114061259 diff --git a/saved_model/results/checkpoint-2067/pytorch_model.bin b/saved_model/results/checkpoint-2067/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..0970660a64c4cd705f0461524cc7040ad9b74d4f --- /dev/null +++ b/saved_model/results/checkpoint-2067/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbee600b79b13c97df962da93ee3f7fb609dca9ca9b3177c1fd30286d2e056c6 +size 57430370 diff --git a/saved_model/results/checkpoint-2067/rng_state.pth b/saved_model/results/checkpoint-2067/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..eea28330ecf6f391f411817bddf6220c3b0ffc70 --- /dev/null +++ 
b/saved_model/results/checkpoint-2067/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19087021c29d60d24bf341b2ff7ef8a49ce99facfa2a672ce7a47321957e7847 +size 14455 diff --git a/saved_model/results/checkpoint-2067/scheduler.pt b/saved_model/results/checkpoint-2067/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4edd8971b5b85aa4da97aee054f58160d582ef8 --- /dev/null +++ b/saved_model/results/checkpoint-2067/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0ff74fb21fc3805313fd6e005ac61803f6ac99896f400de11abbb28a4ad03f4 +size 1465 diff --git a/saved_model/results/checkpoint-2067/special_tokens_map.json b/saved_model/results/checkpoint-2067/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f018c31f8fbacc2583f66c4f9d486c3987e18720 --- /dev/null +++ b/saved_model/results/checkpoint-2067/special_tokens_map.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c3507f36dff57bce437223db3b3081d1e2b52ec3e56ee55438193ecb2c94dd6 +size 132 diff --git a/saved_model/results/checkpoint-2067/tokenizer.json b/saved_model/results/checkpoint-2067/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..fdeda05537d802dc152c5710d90fe4f9d39a35f8 --- /dev/null +++ b/saved_model/results/checkpoint-2067/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0e79933b9ed51798a3ae27893d3c5fa4a201126cef75586296df9b4d2c62a0 +size 711661 diff --git a/saved_model/results/checkpoint-2067/tokenizer_config.json b/saved_model/results/checkpoint-2067/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d79f110f85a86b2429972e257c10f51a2fff4677 --- /dev/null +++ b/saved_model/results/checkpoint-2067/tokenizer_config.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e86d243cbe5f8327d7eb8dda2415dce752b0e18da1a1b5e417a856647a06a3c2 +size 409 diff --git a/saved_model/results/checkpoint-2067/trainer_state.json b/saved_model/results/checkpoint-2067/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..386aaa30c55835135f10e34981ef040ed83eb93a --- /dev/null +++ b/saved_model/results/checkpoint-2067/trainer_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b225f975e82dd695da24e907320841e0cd9da2ec901023dbf28b426992be3e29 +size 1773 diff --git a/saved_model/results/checkpoint-2067/training_args.bin b/saved_model/results/checkpoint-2067/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..983a7d59005b1a8c52b27ba4479a2d41c4415720 --- /dev/null +++ b/saved_model/results/checkpoint-2067/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fd3afd0bd610815f9b7237ec63bc18799fa4677567d19f3a2cf1d051aff770e +size 4817 diff --git a/saved_model/results/checkpoint-2067/vocab.txt b/saved_model/results/checkpoint-2067/vocab.txt new file mode 100644 index 0000000000000000000000000000000000000000..cada3e340cf0ce35daff62c70d0fb6b705c64dea --- /dev/null +++ b/saved_model/results/checkpoint-2067/vocab.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3 +size 231508 diff --git a/saved_model/results/checkpoint-2756/optimizer.pt b/saved_model/results/checkpoint-2756/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f24ae50c14630bb13781f21dc290de1f1c64413 --- /dev/null +++ b/saved_model/results/checkpoint-2756/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a77971f3ef5257c681512b88f87cb17daf2f91f91491728257cd3e61bc60903f +size 114061259 diff --git a/saved_model/results/checkpoint-2756/pytorch_model.bin b/saved_model/results/checkpoint-2756/pytorch_model.bin new 
file mode 100644 index 0000000000000000000000000000000000000000..cf840f8325f0dfbb87b425c484a17611e4cafce4 --- /dev/null +++ b/saved_model/results/checkpoint-2756/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e107512c2468a61c8a78723991aa7b9d8bf5a6fd7f61595327ba644c7f34e12c +size 57430370 diff --git a/saved_model/results/checkpoint-2756/rng_state.pth b/saved_model/results/checkpoint-2756/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..37b755ed10ea71b866e1ac7cd4365d8064d54a73 --- /dev/null +++ b/saved_model/results/checkpoint-2756/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea0bfcb225f0521c65de55ece0620872190fb2b25811d8f38329540a1c5ccef1 +size 14455 diff --git a/saved_model/results/checkpoint-2756/scheduler.pt b/saved_model/results/checkpoint-2756/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6d8284878b7dd02a530fbdc53f26568e1371ac2 --- /dev/null +++ b/saved_model/results/checkpoint-2756/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b9208546db9e82f4ce324b67baa0650331d58e71d35fc86b75106c907bf691d +size 1465 diff --git a/saved_model/results/checkpoint-2756/special_tokens_map.json b/saved_model/results/checkpoint-2756/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f018c31f8fbacc2583f66c4f9d486c3987e18720 --- /dev/null +++ b/saved_model/results/checkpoint-2756/special_tokens_map.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c3507f36dff57bce437223db3b3081d1e2b52ec3e56ee55438193ecb2c94dd6 +size 132 diff --git a/saved_model/results/checkpoint-2756/tokenizer.json b/saved_model/results/checkpoint-2756/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..fdeda05537d802dc152c5710d90fe4f9d39a35f8 --- /dev/null +++ b/saved_model/results/checkpoint-2756/tokenizer.json @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0e79933b9ed51798a3ae27893d3c5fa4a201126cef75586296df9b4d2c62a0 +size 711661 diff --git a/saved_model/results/checkpoint-2756/tokenizer_config.json b/saved_model/results/checkpoint-2756/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d79f110f85a86b2429972e257c10f51a2fff4677 --- /dev/null +++ b/saved_model/results/checkpoint-2756/tokenizer_config.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e86d243cbe5f8327d7eb8dda2415dce752b0e18da1a1b5e417a856647a06a3c2 +size 409 diff --git a/saved_model/results/checkpoint-2756/trainer_state.json b/saved_model/results/checkpoint-2756/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..87dc1542d16c00acaef6b0ded30259b6c6f177f7 --- /dev/null +++ b/saved_model/results/checkpoint-2756/trainer_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21a99f476b71c91fa2782e32a8e5e27f2ee537d5ff893b50468465cf3a0e28e3 +size 2187 diff --git a/saved_model/results/checkpoint-2756/training_args.bin b/saved_model/results/checkpoint-2756/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..983a7d59005b1a8c52b27ba4479a2d41c4415720 --- /dev/null +++ b/saved_model/results/checkpoint-2756/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fd3afd0bd610815f9b7237ec63bc18799fa4677567d19f3a2cf1d051aff770e +size 4817 diff --git a/saved_model/results/checkpoint-2756/vocab.txt b/saved_model/results/checkpoint-2756/vocab.txt new file mode 100644 index 0000000000000000000000000000000000000000..cada3e340cf0ce35daff62c70d0fb6b705c64dea --- /dev/null +++ b/saved_model/results/checkpoint-2756/vocab.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3 +size 231508 diff --git 
a/saved_model/results/checkpoint-3445/optimizer.pt b/saved_model/results/checkpoint-3445/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..aba1ac48edfb847a43a14d4f5b46774309380bff --- /dev/null +++ b/saved_model/results/checkpoint-3445/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342aa7c382ab1c04d7945e8d8b58954a9d56e24c80cfc992b77ab13dc6d4f354 +size 114061259 diff --git a/saved_model/results/checkpoint-3445/pytorch_model.bin b/saved_model/results/checkpoint-3445/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..2b79a9889fc8fb970b7b8340fc185de9cff74f64 --- /dev/null +++ b/saved_model/results/checkpoint-3445/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:771059cf19e1bff8311d52fd4edb68b74bbfed48d4bcf7e3a1a3e15cd05b5f64 +size 57430370 diff --git a/saved_model/results/checkpoint-3445/rng_state.pth b/saved_model/results/checkpoint-3445/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..eadf57a18ae27b336f8f203aec5884da54d31f6c --- /dev/null +++ b/saved_model/results/checkpoint-3445/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48c72082d486b20c672f7a5ab044c51e73310526658bee6c305eb79bd83d0758 +size 14455 diff --git a/saved_model/results/checkpoint-3445/scheduler.pt b/saved_model/results/checkpoint-3445/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fcdd96576b65fa2ec6964ed1a6d926355e356215 --- /dev/null +++ b/saved_model/results/checkpoint-3445/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cad7fef0bfa216665aad66080b3446ec9ba91e2b0edb2c1945d787287b01b6a +size 1465 diff --git a/saved_model/results/checkpoint-3445/special_tokens_map.json b/saved_model/results/checkpoint-3445/special_tokens_map.json new file mode 100644 index 
0000000000000000000000000000000000000000..f018c31f8fbacc2583f66c4f9d486c3987e18720 --- /dev/null +++ b/saved_model/results/checkpoint-3445/special_tokens_map.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c3507f36dff57bce437223db3b3081d1e2b52ec3e56ee55438193ecb2c94dd6 +size 132 diff --git a/saved_model/results/checkpoint-3445/tokenizer.json b/saved_model/results/checkpoint-3445/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..fdeda05537d802dc152c5710d90fe4f9d39a35f8 --- /dev/null +++ b/saved_model/results/checkpoint-3445/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0e79933b9ed51798a3ae27893d3c5fa4a201126cef75586296df9b4d2c62a0 +size 711661 diff --git a/saved_model/results/checkpoint-3445/tokenizer_config.json b/saved_model/results/checkpoint-3445/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d79f110f85a86b2429972e257c10f51a2fff4677 --- /dev/null +++ b/saved_model/results/checkpoint-3445/tokenizer_config.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e86d243cbe5f8327d7eb8dda2415dce752b0e18da1a1b5e417a856647a06a3c2 +size 409 diff --git a/saved_model/results/checkpoint-3445/trainer_state.json b/saved_model/results/checkpoint-3445/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2168e49aa216eb738353a46554cd077293d96e89 --- /dev/null +++ b/saved_model/results/checkpoint-3445/trainer_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54bf8c6ed97dd610b377df9bbe40deff4e34dd08ebe1fd7edca8bab9d9200241 +size 2603 diff --git a/saved_model/results/checkpoint-3445/training_args.bin b/saved_model/results/checkpoint-3445/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..983a7d59005b1a8c52b27ba4479a2d41c4415720 --- /dev/null +++ b/saved_model/results/checkpoint-3445/training_args.bin @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fd3afd0bd610815f9b7237ec63bc18799fa4677567d19f3a2cf1d051aff770e +size 4817 diff --git a/saved_model/results/checkpoint-3445/vocab.txt b/saved_model/results/checkpoint-3445/vocab.txt new file mode 100644 index 0000000000000000000000000000000000000000..cada3e340cf0ce35daff62c70d0fb6b705c64dea --- /dev/null +++ b/saved_model/results/checkpoint-3445/vocab.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3 +size 231508 diff --git a/saved_model/results/checkpoint-689/optimizer.pt b/saved_model/results/checkpoint-689/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfa42d99e3dde2d2c4d8c900ec25a506f6c5b77b --- /dev/null +++ b/saved_model/results/checkpoint-689/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02181760a4cc7ab85f271fb776a09189823969b58f07af29008d6849c8657786 +size 114061259 diff --git a/saved_model/results/checkpoint-689/pytorch_model.bin b/saved_model/results/checkpoint-689/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..64f0c094326c1c24c2a1198d8edf7bb36c7352e7 --- /dev/null +++ b/saved_model/results/checkpoint-689/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58d8ef0658ebf6631168d46ce555873ac39963fafa5c3740b0ec4401a48e61d8 +size 57430370 diff --git a/saved_model/results/checkpoint-689/rng_state.pth b/saved_model/results/checkpoint-689/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8069e072434e05cccb9ff580b872bb5aa88f9633 --- /dev/null +++ b/saved_model/results/checkpoint-689/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a163b7f4b601ffa69e8c26e1eaf05d25873bb6a603fe9efa29bfb3088206e0fd +size 14455 diff --git a/saved_model/results/checkpoint-689/scheduler.pt 
b/saved_model/results/checkpoint-689/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d420c6c722dcc8ae27106b3a08605d1943d21513 --- /dev/null +++ b/saved_model/results/checkpoint-689/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e024ff72a2b6b9733d5dba520ef07b01c55348a3f5fe75a82cb49c1b3587647 +size 1465 diff --git a/saved_model/results/checkpoint-689/special_tokens_map.json b/saved_model/results/checkpoint-689/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f018c31f8fbacc2583f66c4f9d486c3987e18720 --- /dev/null +++ b/saved_model/results/checkpoint-689/special_tokens_map.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c3507f36dff57bce437223db3b3081d1e2b52ec3e56ee55438193ecb2c94dd6 +size 132 diff --git a/saved_model/results/checkpoint-689/tokenizer.json b/saved_model/results/checkpoint-689/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..fdeda05537d802dc152c5710d90fe4f9d39a35f8 --- /dev/null +++ b/saved_model/results/checkpoint-689/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0e79933b9ed51798a3ae27893d3c5fa4a201126cef75586296df9b4d2c62a0 +size 711661 diff --git a/saved_model/results/checkpoint-689/tokenizer_config.json b/saved_model/results/checkpoint-689/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d79f110f85a86b2429972e257c10f51a2fff4677 --- /dev/null +++ b/saved_model/results/checkpoint-689/tokenizer_config.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e86d243cbe5f8327d7eb8dda2415dce752b0e18da1a1b5e417a856647a06a3c2 +size 409 diff --git a/saved_model/results/checkpoint-689/trainer_state.json b/saved_model/results/checkpoint-689/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..54f15cf28bc30844e2388ad39d6275d071a44309 --- /dev/null 
+++ b/saved_model/results/checkpoint-689/trainer_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2e7b2497b9241850c4de52227d0294d7898c089682b02570eb3798ebce50b72 +size 813 diff --git a/saved_model/results/checkpoint-689/training_args.bin b/saved_model/results/checkpoint-689/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..983a7d59005b1a8c52b27ba4479a2d41c4415720 --- /dev/null +++ b/saved_model/results/checkpoint-689/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fd3afd0bd610815f9b7237ec63bc18799fa4677567d19f3a2cf1d051aff770e +size 4817 diff --git a/saved_model/results/checkpoint-689/vocab.txt b/saved_model/results/checkpoint-689/vocab.txt new file mode 100644 index 0000000000000000000000000000000000000000..cada3e340cf0ce35daff62c70d0fb6b705c64dea --- /dev/null +++ b/saved_model/results/checkpoint-689/vocab.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3 +size 231508 diff --git a/saved_model/special_tokens_map.json b/saved_model/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f018c31f8fbacc2583f66c4f9d486c3987e18720 --- /dev/null +++ b/saved_model/special_tokens_map.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c3507f36dff57bce437223db3b3081d1e2b52ec3e56ee55438193ecb2c94dd6 +size 132 diff --git a/saved_model/tokenizer.json b/saved_model/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..fdeda05537d802dc152c5710d90fe4f9d39a35f8 --- /dev/null +++ b/saved_model/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0e79933b9ed51798a3ae27893d3c5fa4a201126cef75586296df9b4d2c62a0 +size 711661 diff --git a/saved_model/tokenizer_config.json b/saved_model/tokenizer_config.json new file mode 100644 index 
import os
import re

import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import Dataset
from nltk.corpus import stopwords
from sklearn.metrics import (
    mean_squared_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, cohen_kappa_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    IntervalStrategy,
    Trainer,
    TrainingArguments,
)


class TinyFinBERTRegressor(nn.Module):
    """TinyBERT encoder topped with a linear head that emits one sentiment score.

    The score is trained against labels in {-1.0, 0.0, 1.0} (negative /
    neutral / positive), so outputs land roughly in [-1, 1].
    """

    def __init__(self, pretrained_model='huawei-noah/TinyBERT_General_4L_312D'):
        super().__init__()
        if pretrained_model:
            self.config = AutoConfig.from_pretrained(pretrained_model)
            self.bert = AutoModel.from_pretrained(pretrained_model, config=self.config)
        else:
            # BUG FIX: AutoConfig()/AutoModel(config) cannot be instantiated
            # directly (they raise); use the factory helpers to build a
            # randomly-initialized BERT instead.
            self.config = AutoConfig.for_model("bert")
            self.bert = AutoModel.from_config(self.config)
        self.regressor = nn.Linear(self.config.hidden_size, 1)

        # Manually register the position_ids buffer so state_dicts produced
        # by a different transformers version load without a missing-key error.
        self.bert.embeddings.register_buffer(
            "position_ids",
            torch.arange(512).expand((1, -1)),
            persistent=False,
        )

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the encoder and head; return {'loss', 'score'} (Trainer-compatible)."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # [CLS] token representation summarizes the sequence.
        cls_output = outputs.last_hidden_state[:, 0]
        # BUG FIX: squeeze only the feature dimension — a bare .squeeze()
        # collapses a batch of size 1 to a 0-d tensor, which breaks/warns in
        # the MSE broadcast against a (1,)-shaped labels tensor.
        score = self.regressor(cls_output).squeeze(-1)
        # .float() is a no-op for float labels but protects against integer
        # label tensors arriving from the data pipeline.
        loss = F.mse_loss(score, labels.float()) if labels is not None else None
        return {'loss': loss, 'score': score}


# Cache the spaCy pipeline at module scope: loading en_core_web_sm is
# expensive and the original code reloaded it on EVERY preprocess_texts call.
_NLP = None


def _get_nlp():
    """Lazily load and memoize the spaCy pipeline used for lemmatization."""
    global _NLP
    if _NLP is None:
        _NLP = spacy.load("en_core_web_sm", disable=["ner", "parser"])  # speeds up processing
    return _NLP


def preprocess_texts(texts):
    """Lowercase, strip non-letters, and lemmatize each text.

    Args:
        texts: Iterable of raw strings.

    Returns:
        List of cleaned, space-joined lemma strings (same order as input).

    Note: the original built an NLTK stop-word set here but never applied it
    (the filter was commented out); that dead call could raise LookupError
    when the NLTK corpus isn't downloaded, so it has been removed.
    """
    nlp = _get_nlp()
    processed = []
    for text in texts:
        text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        doc = nlp(text)
        tokens = [token.lemma_ for token in doc if token.lemma_.strip()]
        processed.append(' '.join(tokens))
    return processed


def load_phrasebank(path):
    """Parse Financial PhraseBank 'sentence@label' lines into a DataFrame.

    Returns a DataFrame with columns 'text' (str) and 'score' mapped as
    neutral -> 0.0, negative -> -1.0, anything else -> 1.0.
    """
    sents, scores = [], []
    with open(path, 'r', encoding='latin1') as f:
        for line in f:
            if '@' not in line:
                continue
            # BUG FIX: split on the LAST '@' so sentences that themselves
            # contain '@' (e-mail addresses, handles) don't raise ValueError
            # from unpacking more than two fields.
            s, label = line.strip().rsplit('@', 1)
            label = label.lower()
            score = 0.0 if label == 'neutral' else (-1.0 if label == 'negative' else 1.0)
            sents.append(s)
            scores.append(score)
    return pd.DataFrame({'text': sents, 'score': scores})


def load_words_phrases(path):
    """Parse 'text,score' lexicon lines (score is the trailing numeric field).

    Lines without a trailing ',<number>' are skipped.
    """
    data = []
    with open(path, 'r', encoding='latin1') as f:
        for line in f:
            line = line.strip()
            match = re.search(r',(-?\d+\.?\d*)$', line)
            if match:
                text = line[:match.start()].strip()
                score = float(match.group(1))
                data.append((text, score))
    return pd.DataFrame(data, columns=["text", "score"])


def train_model(phrase_path, words_path, save_path):
    """Fine-tune TinyFinBERTRegressor on PhraseBank + lexicon data.

    Args:
        phrase_path: Path to the Financial PhraseBank 'sentence@label' file.
        words_path: Path to the 'text,score' lexicon file.
        save_path: Directory receiving checkpoints, final weights and tokenizer.
    """
    os.makedirs(save_path, exist_ok=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    phrase_df = load_phrasebank(phrase_path)
    words_df = load_words_phrases(words_path)

    phrase_df['text'] = preprocess_texts(phrase_df['text'])
    words_df['text'] = preprocess_texts(words_df['text'])

    # Hold out 20% of PhraseBank sentences for evaluation; the lexicon is
    # used for training only.
    train_phrase, test_phrase = train_test_split(phrase_df, test_size=0.2, random_state=42)
    train_df = pd.concat([train_phrase, words_df])
    test_df = test_phrase.reset_index(drop=True)

    tokenizer = AutoTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')

    def tokenize(batch):
        # Fixed-length padding keeps tensors rectangular without a collator.
        tokens = tokenizer(batch["text"], padding='max_length', truncation=True, max_length=128)
        tokens["labels"] = batch["score"]
        return tokens

    train_dataset = Dataset.from_pandas(train_df).map(tokenize, batched=True)
    test_dataset = Dataset.from_pandas(test_df).map(tokenize, batched=True)

    args = TrainingArguments(
        output_dir=os.path.join(save_path, "results"),
        evaluation_strategy=IntervalStrategy.EPOCH,
        save_strategy=IntervalStrategy.EPOCH,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        num_train_epochs=5,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
    )

    model = TinyFinBERTRegressor().to(device)

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=lambda pred: {
            "mse": mean_squared_error(pred.label_ids, pred.predictions),
            "r2": r2_score(pred.label_ids, pred.predictions),
        },
    )

    trainer.train()

    # Unwrap DataParallel/DistributedDataParallel before saving so keys
    # aren't prefixed with 'module.'.
    model_to_save = model.module if hasattr(model, 'module') else model
    torch.save(model_to_save.state_dict(), os.path.join(save_path, "regressor_model.pt"))
    tokenizer.save_pretrained(save_path)
    print(f"Model saved to {save_path}")
import label_binarize + + +def evaluate_model(phrase_path, model_path): + # Set device + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Using device: {device}") + + phrase_df = load_phrasebank(phrase_path) + _, test_df = train_test_split(phrase_df, test_size=0.2, random_state=42) + test_df['text'] = preprocess_texts(test_df['text']) + + tokenizer = AutoTokenizer.from_pretrained(model_path) + model = TinyFinBERTRegressor() + model.load_state_dict(torch.load(os.path.join(model_path, "regressor_model.pt"), map_location=device)) + model.to(device) + model.eval() + + y_true, y_pred, y_scores = [], [], [] + + for _, row in test_df.iterrows(): + inputs = tokenizer(row["text"], return_tensors="pt", truncation=True, padding='max_length', max_length=128) + inputs = {k: v.to(device) for k, v in inputs.items() if k != "token_type_ids"} + with torch.no_grad(): + score = model(**inputs)["score"].item() + y_scores.append(score) + y_true.append(row["score"]) + + # regression metrics + mse = mean_squared_error(y_true, y_scores) + r2 = r2_score(y_true, y_scores) + + y_pred = [1 if s > 0.3 else -1 if s < -0.3 else 0 for s in y_scores] + y_true_classes = [int(round(s)) for s in y_true] + + acc = accuracy_score(y_true_classes, y_pred) + prec = precision_score(y_true_classes, y_pred, average='weighted', zero_division=0) + rec = recall_score(y_true_classes, y_pred, average='weighted') + f1 = f1_score(y_true_classes, y_pred, average='weighted') + kappa = cohen_kappa_score(y_true_classes, y_pred) + cm = confusion_matrix(y_true_classes, y_pred) + + y_true_bin = label_binarize(y_true_classes, classes=[-1, 0, 1]) + y_score_bin = label_binarize(y_pred, classes=[-1, 0, 1]) + roc_auc = roc_auc_score(y_true_bin, y_score_bin, average='macro', multi_class='ovo') + + print(f"Sentiment Regression Metrics:") + print(f"- MSE: {mse:.4f}") + print(f"- R²: {r2:.4f}") + print(f"- Accuracy: {acc:.4f}") + print(f"- Precision: {prec:.4f}") + print(f"- Recall: {rec:.4f}") + 
print(f"- F1 Score: {f1:.4f}") + print(f"- ROC-AUC: {roc_auc:.4f}") + print(f"- Cohen's Kappa: {kappa:.4f}") + print(f"- Confusion Matrix:\n{cm}") + + +def test(model_path): + # Set device + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Using device: {device}") + + tokenizer = AutoTokenizer.from_pretrained(model_path) + model = TinyFinBERTRegressor() + model.load_state_dict(torch.load(os.path.join(model_path, "regressor_model.pt"), map_location=device)) + model.to(device) + model.eval() + + texts = [ + "The company's earnings exceeded expectations.", + "They faced major losses this quarter.", + "They didn't face major losses this quarter.", + "Stock prices remained the same.", + "boost", + "strong boost", + "AMD was not able to reduce losses.", + "AMD reduced debt significantly, improves balance sheet", + "Economic indicators point to contraction in telecom sector", + "Company didn't have increased losses over last years." + ] + + for text in texts: + clean_text = preprocess_texts([text])[0] + print(f"Original Text: {text}") + print(f"Processed Text: {clean_text}") + + tokens = tokenizer.tokenize(clean_text) + print(f"Tokens: {tokens}") + + inputs = tokenizer(clean_text, return_tensors="pt", truncation=True, padding='max_length', max_length=128) + inputs = {k: v.to(device) for k, v in inputs.items() if k != "token_type_ids"} + + with torch.no_grad(): + score = model(**inputs)["score"].item() + + print(f"Predicted Sentiment Score: {score:.3f}") + sentiment = "positive" if score > 0.3 else "negative" if score < -0.3 else "neutral" + print(f"Sentiment: {sentiment}\n") + + +def init_model(): + """Function to properly initialize model with position_ids regardless of whether it's being loaded or created new""" + model = TinyFinBERTRegressor() + + # Make sure position_ids is registered + if not hasattr(model.bert.embeddings, 'position_ids'): + model.bert.embeddings.register_buffer( + "position_ids", + torch.arange(512).expand((1, -1)), + 
persistent=False, + ) + return model + + +def create_api_model(model_path): + """Create a model suitable for a FastAPI application""" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + tokenizer = AutoTokenizer.from_pretrained(model_path) + + # Initialize model with position_ids properly registered + model = init_model() + model.load_state_dict(torch.load(os.path.join(model_path, "regressor_model.pt"), map_location=device)) + model.to(device) + model.eval() + + return model, tokenizer, device + + +if __name__ == "__main__": + model_dir = "./saved_model" + phrase_path = "./Sentences_50Agree.txt" + words_path = "./financial_sentiment_words_phrases_negations.csv" + + # Check for GPU availability + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Using device: {device}") + + if not os.path.isfile(os.path.join(model_dir, "regressor_model.pt")): + print("Training new model...") + train_model(phrase_path, words_path, model_dir) + else: + print(f"Model found at {os.path.join(model_dir, 'regressor_model.pt')}") + + evaluate_model(phrase_path, model_dir) + test(model_dir) \ No newline at end of file