import os
import json
from collections import defaultdict
from typing import Any, Dict, List, Tuple

import gradio as gr

# Import the GLiNER model and the relation extractor built on top of it
from gliner import GLiNER
from relation_extraction import CustomGLiNERRelationExtractor

# Cache directory for model weights (useful on Spaces)
_CACHE_DIR = os.environ.get("CACHE_DIR")

# Initialize the fine-tuned model and relation extractor
DATA_MODEL_ID = "rafmacalaba/gliner_re_finetuned-v7-pos"
model = GLiNER.from_pretrained(DATA_MODEL_ID, cache_dir=_CACHE_DIR)
relation_extractor = CustomGLiNERRelationExtractor(model=model, return_index=True)
# Sample text shown in the input box on load
SAMPLE_TEXT = (
    "Encuesta Nacional de Hogares (ENAHO) is the Peruvian version of the Living "
    "Standards Measurement Survey, i.e. a nationally representative household survey "
    "collected monthly on a continuous basis. For our analysis, we use data from "
    "January 2007 to December 2020. The survey covers a wide variety of topics, "
    "including basic demographics, educational background, labor market conditions, "
    "crime victimization, and a module on respondent’s perceptions about the main "
    "problems in the country and trust in different local and national-level institutions."
)
# Entity labels and the relation slots to query for each entity type
labels = ['named dataset', 'unnamed dataset', 'vague dataset']
rels = [
    'acronym', 'author', 'data description', 'data geography', 'data source',
    'data type', 'publication year', 'publisher', 'reference population',
    'reference year', 'version',
]
TYPE2RELS = {
    "named dataset": rels,
    "unnamed dataset": rels,
    "vague dataset": rels,
}
def inference_pipeline(
    text: str,
    model,
    labels: List[str],
    relation_extractor: CustomGLiNERRelationExtractor,
    TYPE2RELS: Dict[str, List[str]],
    ner_threshold: float = 0.7,
    rel_threshold: float = 0.5,
    re_multi_label: bool = False,
    return_index: bool = False,
) -> Tuple[List[Dict[str, Any]], Dict[str, List[Dict[str, Any]]]]:
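    """Run NER over `text`, then score candidate relations for each entity.

    Returns (ner_preds, re_results): the raw entity predictions and a dict
    mapping each entity span to its relation predictions.
    """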
    ner_preds = model.predict_entities(
        text,
        labels,
        flat_ner=True,
        threshold=ner_threshold,
    )
    re_results: Dict[str, List[Dict[str, Any]]] = {}
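    # For each detected entity, build "span <> relation" prompts and let the
    # extractor score them against the text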
    for ner in ner_preds:
        span = ner['text']
        rel_types = TYPE2RELS.get(ner['label'], [])
        if not rel_types:
            continue
        slot_labels = [f"{span} <> {r}" for r in rel_types]
        preds = relation_extractor(
            text,
            relations=None,
            entities=None,
            relation_labels=slot_labels,
            threshold=rel_threshold,
            multi_label=re_multi_label,
            return_index=return_index,
        )[0]
        re_results[span] = preds
    return ner_preds, re_results
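# Example usage (illustrative; exact spans and scores depend on the model):
#   ner_preds, re_results = inference_pipeline(
#       SAMPLE_TEXT, model, labels, relation_extractor, TYPE2RELS
#   )
#   # ner_preds:  [{"text": ..., "label": ..., "start": ..., "end": ...}, ...]
#   # re_results: {"ENAHO": [{"source": ..., "relation": ..., "target": ..., ...}], ...}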
# Post-processing: prune acronym duplicates and self-relations
def prune_acronym_and_self_relations(ner_preds, rel_preds):
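    """Drop entities that are merely acronyms of longer dataset names, and
    drop relations whose target is the source span itself."""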
    # 1) Find acronym targets strictly shorter than their source
    acronym_targets = {
        r["target"]
        for src, rels in rel_preds.items()
        for r in rels
        if r["relation"] == "acronym" and len(r["target"]) < len(src)
    }
    # 2) Filter NER: drop any named dataset whose text is in that set
    filtered_ner = [
        ent for ent in ner_preds
        if not (ent["label"] == "named dataset" and ent["text"] in acronym_targets)
    ]
    # 3) Filter RE: drop blocks for acronym sources, and self-relations
    filtered_re = {}
    for src, rels in rel_preds.items():
        if src in acronym_targets:
            continue
        kept = [r for r in rels if r["target"] != src]
        if kept:
            filtered_re[src] = kept
    return filtered_ner, filtered_re
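# Example: if "Encuesta Nacional de Hogares" carries an acronym relation to
# "ENAHO", the shorter "ENAHO" entity and its own relation block are dropped.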
# Highlighting function
def highlight_text(text, ner_threshold, rel_threshold):
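    """Run the full pipeline and format spans for gr.HighlightedText.

    Returns the highlight payload and the raw predictions (kept in gr.State).
    """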
    # 1) Inference
    ner_preds, rel_preds = inference_pipeline(
        text,
        model=model,
        labels=labels,
        relation_extractor=relation_extractor,
        TYPE2RELS=TYPE2RELS,
        ner_threshold=ner_threshold,
        rel_threshold=rel_threshold,
        re_multi_label=False,
        return_index=True,
    )
    ner_preds, rel_preds = prune_acronym_and_self_relations(ner_preds, rel_preds)
    # 2) Compute the length of the RE prompt prefix.
    # This must match exactly what the extractor prepends to the text.
    prefix = f"{relation_extractor.prompt} \n "
    prefix_len = len(prefix)
    # 3) Gather spans
    spans = []
    for ent in ner_preds:
        spans.append((ent["start"], ent["end"], ent["label"]))
    # Use the extractor-returned start/end, minus prefix_len
    for src, rels in rel_preds.items():
        for r in rels:
            # Shift the indices back onto the raw text
            s = r["start"] - prefix_len
            e = r["end"] - prefix_len
            # Skip anything that fell outside the raw text
            if s < 0 or e < 0:
                continue
            label = f"{r['source']} <> {r['relation']}"
            spans.append((s, e, label))
    # 4) Merge labels that share the same span and build the entity list
    merged = defaultdict(list)
    for s, e, lbl in spans:
        merged[(s, e)].append(lbl)
    entities = []
    for (s, e), lbls in sorted(merged.items(), key=lambda x: x[0]):
        entities.append({
            "entity": ", ".join(lbls),
            "start": s,
            "end": e,
        })
    return {"text": text, "entities": entities}, {"ner": ner_preds, "relations": rel_preds}
# JSON output function
def _cached_predictions(state):
    if not state:
        return "📋 No predictions yet. Click **Submit** first."
    return json.dumps(state, indent=2)
with gr.Blocks() as demo:
    gr.Markdown(f"""# Data Use Detector

This Space demonstrates our fine-tuned GLiNER model’s ability to spot **dataset mentions** and **relations** in any input text. It identifies dataset names via NER, then extracts relations such as **publisher**, **acronym**, **publication year**, **data geography**, and more.

**How it works**

1. **NER**: Recognizes dataset names in your text.
2. **RE**: Links each dataset to its attributes (e.g., publisher, year, acronym).
3. **Visualization**: Highlights entities and relation spans inline.

**Instructions**

1. Paste or edit your text in the box below.
2. Tweak the **NER** & **RE** confidence sliders.
3. Click **Submit** to see highlights.
4. Click **Get Model Predictions** to view the raw JSON output.

**Resources**

- **Model:** [{DATA_MODEL_ID}](https://huggingface.co/{DATA_MODEL_ID})
- **Paper:** _Large Language Models and Synthetic Data for Monitoring Dataset Mentions in Research Papers_ – arXiv: [2502.10263](https://arxiv.org/pdf/2502.10263)
- [GLiNER GitHub Repo](https://github.com/urchade/GLiNER)
- [Project Docs](https://worldbank.github.io/ai4data-use/docs/introduction.html)
""")
    txt_in = gr.Textbox(
        label="Input Text",
        lines=4,
        value=SAMPLE_TEXT,
    )
    ner_slider = gr.Slider(
        0, 1, value=0.7, step=0.01,
        label="NER Threshold",
        info="Minimum confidence for named-entity spans.",
    )
    re_slider = gr.Slider(
        0, 1, value=0.5, step=0.01,
        label="RE Threshold",
        info="Minimum confidence for relation extractions.",
    )
    highlight_btn = gr.Button("Submit")
    txt_out = gr.HighlightedText(label="Annotated Entities")
    get_pred_btn = gr.Button("Get Model Predictions")
    json_out = gr.Textbox(label="Model Predictions (JSON)", lines=15)
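    # Holds the most recent raw predictions for the JSON viewer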
    state = gr.State()

    # Wire up interactions
    highlight_btn.click(
        fn=highlight_text,
        inputs=[txt_in, ner_slider, re_slider],
        outputs=[txt_out, state],
    )
    get_pred_btn.click(
        fn=_cached_predictions,
        inputs=[state],
        outputs=[json_out],
    )

# Enable queue for concurrency
demo.queue(default_concurrency_limit=5)

# Launch the app
demo.launch(debug=True, inline=True)