File size: 3,228 Bytes
3d53082
 
3b9fb2c
3d53082
 
 
 
 
 
3b9fb2c
3d53082
 
 
 
 
 
 
 
 
c35975c
 
3d53082
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3b9fb2c
3d53082
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c35975c
 
3d53082
 
c35975c
306e33b
 
 
c35975c
3d53082
306e33b
 
 
3d53082
 
306e33b
 
3d53082
306e33b
 
 
 
c35975c
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import json
import gradio as gr
import torch
import spaces
from gliner import GLiNER
from gliner.multitask import GLiNERRelationExtractor
from typing import List, Dict, Any, Tuple
from tqdm.auto import tqdm

# Configuration
# HF Hub repo id of the fine-tuned GLiNER NER+RE checkpoint to load.
data_model_id = "rafmacalaba/gliner_re_finetuned-v3"
# Optional local cache directory for model downloads; None falls back to
# the huggingface_hub default cache location.
CACHE_DIR = os.environ.get("CACHE_DIR", None)

# Relation types
# Candidate relation slots queried for every detected dataset entity.
trels = [
    'acronym', 'author', 'data description',
    'data geography', 'data source', 'data type',
    'publication year', 'publisher', 'reference year', 'version'
]

# Map NER labels to relation types
# All three dataset entity labels share the same relation-type list.
TYPE2RELS = {
    "named dataset":   trels,
    "unnamed dataset": trels,
    "vague dataset":   trels,
}

# Load models
# NOTE: the model is loaded eagerly at import time (a Gradio Spaces
# convention) so the first request does not pay the download cost.
print("Loading NER+RE model...")
model = GLiNER.from_pretrained(data_model_id, cache_dir=CACHE_DIR)
# The relation extractor wraps the same underlying GLiNER model.
relation_extractor = GLiNERRelationExtractor(model=model)
if torch.cuda.is_available():
    model.to("cuda")
    # relation_extractor.model is the same object as `model`, so this
    # second .to("cuda") is a harmless no-op kept for explicitness.
    relation_extractor.model.to("cuda")
print("Models loaded.")

# Inference pipeline
def inference_pipeline(
    text: str,
    model,
    labels: List[str],
    relation_extractor: GLiNERRelationExtractor,
    TYPE2RELS: Dict[str, List[str]],
    ner_threshold: float = 0.5,
    re_threshold: float = 0.4,
    re_multi_label: bool = False,
) -> Tuple[List[Dict[str, Any]], Dict[str, List[Dict[str, Any]]]]:
    """Run NER, then per-entity relation extraction, over *text*.

    Args:
        text: Input document to analyze.
        model: GLiNER model exposing ``predict_entities``.
        labels: Entity labels to detect (e.g. the dataset label set).
        relation_extractor: Callable GLiNER relation extractor.
        TYPE2RELS: Maps an entity label to the relation types to query
            for entities of that label; labels absent from the map are
            skipped for relation extraction.
        ner_threshold: Confidence cutoff for entity predictions.
        re_threshold: Confidence cutoff for relation predictions.
        re_multi_label: Whether a slot may receive multiple relations.

    Returns:
        Tuple of (entity predictions, mapping from entity span text to
        its accumulated relation predictions).
    """
    # NER predictions (flat_ner=True: no overlapping spans).
    ner_preds = model.predict_entities(
        text,
        labels,
        flat_ner=True,
        threshold=ner_threshold
    )

    # Relation extraction per entity span
    re_results: Dict[str, List[Dict[str, Any]]] = {}
    for ner in ner_preds:
        span = ner['text']
        rel_types = TYPE2RELS.get(ner['label'], [])
        if not rel_types:
            continue
        # One "span <> relation" query label per candidate relation type.
        slot_labels = [f"{span} <> {r}" for r in rel_types]
        preds = relation_extractor(
            text,
            relations=None,
            entities=None,
            relation_labels=slot_labels,
            threshold=re_threshold,
            multi_label=re_multi_label,
            distance_threshold=100,
        )[0]
        # Merge rather than overwrite: the same surface text can be
        # predicted more than once (e.g. at different positions or under
        # different labels); plain assignment silently dropped all but
        # the last entity's relation results.
        re_results.setdefault(span, []).extend(preds)

    return ner_preds, re_results

# Gradio UI - Step 2: Model Inference
@spaces.GPU(enable_queue=True, duration=120)
def model_inference(query: str) -> str:
    """Run the NER + RE pipeline on *query* and return the result as JSON.

    Returns a pretty-printed JSON string with two top-level keys:
    ``entities`` (NER predictions) and ``relations`` (per-span RE results).
    """
    dataset_labels = ["named dataset", "unnamed dataset", "vague dataset"]
    entities, relations = inference_pipeline(
        query,
        model,
        dataset_labels,
        relation_extractor,
        TYPE2RELS,
    )
    payload = {
        "entities": entities,
        "relations": relations,
    }
    return json.dumps(payload, indent=2)

# Gradio front-end: a single text box in, a JSON text box out, wired to
# model_inference via the Submit button.
with gr.Blocks(title="Step 2: NER + Relation Inference") as demo:
    gr.Markdown(
        """
        ## Step 2: Integrate Model Inference
        Enter text and click submit to run your GLiNER-based NER + RE pipeline.
        """
    )
    user_text = gr.Textbox(
        lines=4,
        placeholder="Type your text here...",
        label="Input Text",
    )
    run_button = gr.Button("Submit")
    result_json = gr.Textbox(
        lines=15,
        label="Model Output (JSON)",
    )
    run_button.click(
        fn=model_inference,
        inputs=[user_text],
        outputs=[result_json],
    )

# Launch only when executed as a script (not when imported).
if __name__ == "__main__":
    demo.launch(debug=True)