xu1998hz committed
Commit 9bab39e
0 Parent(s):

Duplicate from xu1998hz/sescore

Files changed (8)
  1. .gitattributes +33 -0
  2. README.md +46 -0
  3. app.py +73 -0
  4. description.md +59 -0
  5. img/logo_sescore.png +0 -0
  6. requirements.txt +3 -0
  7. sescore.py +139 -0
  8. tests.py +17 -0
.gitattributes ADDED
@@ -0,0 +1,33 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,46 @@
+ ---
+ title: SEScore
+ datasets:
+ - null
+ tags:
+ - evaluate
+ - metric
+ description: 'SEScore: a text generation evaluation metric'
+ sdk: gradio
+ sdk_version: 3.0.2
+ app_file: app.py
+ pinned: false
+ duplicated_from: xu1998hz/sescore
+ ---
+
+ # Metric Card for SEScore
+ ![SEScore logo](https://huggingface.co/spaces/xu1998hz/sescore/resolve/main/img/logo_sescore.png)
+
+ ## Metric Description
+ *SEScore is an unsupervised, learned evaluation metric trained on a synthesized dataset.*
+
+ ## How to Use
+
+ *Load the metric with `evaluate.load("xu1998hz/sescore")` and call `compute` with lists of predictions and references; see the Examples section below.*
+
+ ### Inputs
+ *SEScore takes as input `predictions` (a list of candidate translations) and `references` (a list of reference translations).*
+
+ ### Output Values
+
+ *Output values range from 0 to -25; better predictions receive higher (less negative) scores.*
+
+ #### Values from Popular Papers
+
+
+ ### Examples
+ *A minimal usage example is shown below; SEScore is most reliable for relative ranking of candidate outputs.*
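+
+ The following sketch mirrors the minimal example in `description.md`; the printed values are illustrative only and depend on the downloaded checkpoint and hardware.
+
+ ```python
+ import evaluate
+
+ sescore = evaluate.load("xu1998hz/sescore")
+ results = sescore.compute(
+     references=['sescore is a simple but effective next-generation text evaluation metric'],
+     predictions=['sescore is simple effective text evaluation metric for next generation']
+ )
+ # `results` is a dict with a per-pair list of scores and their mean;
+ # higher (closer to 0) means the prediction is judged closer to the reference.
+ print(results["scores"], results["mean_score"])
+ ```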
+
+ ## Limitations and Bias
+ *SEScore is trained on synthetic, in-domain data; a model trained on one domain (e.g., machine translation) may transfer imperfectly to others. See `description.md` for details.*
+
+ ## Citation
+ *Xu et al., "Not All Errors are Equal: Learning Text Generation Metrics using Stratified Error Synthesis", EMNLP 2022 (https://arxiv.org/abs/2210.05035); full BibTeX in `description.md`.*
+
+ ## Further References
+ *Add any useful further references.*
app.py ADDED
@@ -0,0 +1,73 @@
+ import evaluate
+ import sys
+ import logging
+ from pathlib import Path
+ from evaluate.utils import infer_gradio_input_types, json_to_string_type, parse_readme, parse_gradio_data, parse_test_cases
+
+ logger = logging.getLogger(__name__)
+
+
+ def launch_gradio_widget(metric):
+     """Launches `metric` widget with Gradio."""
+
+     try:
+         import gradio as gr
+     except ImportError as error:
+         logger.error("To create a metric widget with Gradio make sure gradio is installed.")
+         raise error
+
+     local_path = Path(sys.path[0])
+     # if there are several input types, use first as default.
+     if isinstance(metric.features, list):
+         (feature_names, feature_types) = zip(*metric.features[0].items())
+     else:
+         (feature_names, feature_types) = zip(*metric.features.items())
+     gradio_input_types = infer_gradio_input_types(feature_types)
+
+     def compute(data):
+         return metric.compute(**parse_gradio_data(data, gradio_input_types))
+
+     header_html = '''<div style="max-width:800px; margin:auto; float:center; margin-top:0; margin-bottom:0; padding:0;">
+     <img src="https://huggingface.co/spaces/xu1998hz/sescore/resolve/main/img/logo_sescore.png" style="margin:0; padding:0; margin-top:-10px; margin-bottom:-50px;">
+     </div>
+     <h2 style='margin-top: 5pt; padding-top:10pt;'>About <i>SEScore</i></h2>
+
+     <p><b>SEScore</b> is a reference-based text-generation evaluation metric that requires no pre-human-annotated error data,
+     described in our paper <a href="https://arxiv.org/abs/2210.05035"><b>"Not All Errors are Equal: Learning Text Generation Metrics using
+     Stratified Error Synthesis"</b></a> from EMNLP 2022.</p>
+
+     <p>Its effectiveness over prior methods like BLEU, BERTScore, BARTScore, PRISM, COMET and BLEURT has been demonstrated on a diverse set of language generation tasks, including
+     translation, captioning, and web text generation. <a href="https://twitter.com/LChoshen/status/1580136005654700033">Readers have even described SEScore as "one unsupervised evaluation to rule them all"</a>
+     and we are very excited to share it with you!</p>
+
+     <h2 style='margin-top: 10pt; padding-top:0;'>Try it yourself!</h2>
+     <p>Provide sample (gold) reference text and (model output) predicted text below and see how SEScore rates them! It is most performant
+     in a relative ranking setting, so in general <b>it will rank better predictions higher than worse ones.</b> Providing useful
+     absolute numbers based on SEScore is an ongoing direction of investigation.</p>
+     '''.replace('\n', ' ')
+
+     tail_markdown = parse_readme(local_path / "description.md")
+
+     iface = gr.Interface(
+         fn=compute,
+         inputs=gr.inputs.Dataframe(
+             headers=feature_names,
+             col_count=len(feature_names),
+             row_count=2,
+             datatype=json_to_string_type(gradio_input_types),
+         ),
+         outputs=gr.outputs.Textbox(label=metric.name),
+         description=header_html,
+         # title=f"SEScore Metric Usage Example",
+         article=tail_markdown,
+         # TODO: load test cases and use them to populate examples
+         # examples=[parse_test_cases(test_cases, feature_names, gradio_input_types)]
+     )
+
+     print(dir(iface))
+
+     iface.launch()
+
+
+ module = evaluate.load("xu1998hz/sescore")
+ launch_gradio_widget(module)
description.md ADDED
@@ -0,0 +1,59 @@
+ ## Installation and usage
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ Minimal example (evaluating English text generation):
+ ```python
+ import evaluate
+ sescore = evaluate.load("xu1998hz/sescore")
+ score = sescore.compute(
+     references=['sescore is a simple but effective next-generation text evaluation metric'],
+     predictions=['sescore is simple effective text evaluation metric for next generation']
+ )
+ ```
+
+ *SEScore* compares a list of references (gold translation/generated output examples) with a same-length list of candidate generated samples. Currently, the output range is learned, and scores are most useful in relative ranking scenarios rather than for absolute comparisons. We are producing a series of rescaling options to make absolute SEScore-based scoring more effective.
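+
+ A quick way to see the relative-ranking behaviour is to score two candidates against the same reference and compare them. This is only a sketch; the exact numbers depend on the downloaded checkpoint and hardware:
+
+ ```python
+ import evaluate
+
+ sescore = evaluate.load("xu1998hz/sescore")
+
+ reference = 'sescore is a simple but effective next-generation text evaluation metric'
+ close_candidate = 'sescore is a simple and effective next generation text evaluation metric'
+ poor_candidate = 'effective text metric'
+
+ out = sescore.compute(
+     references=[reference, reference],
+     predictions=[close_candidate, poor_candidate],
+ )
+ # out["scores"] holds one score per (reference, prediction) pair;
+ # the closer candidate should receive the higher (less negative) score.
+ print(out["scores"], out["mean_score"])
+ ```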
+
+
+ ### Available pre-trained models
+
+ Currently, the following language/model pairs are available:
+
+ | Language | Pretraining data | Pretrained model link |
+ |----------|------------------|-----------------------|
+ | English | MT | [xu1998hz/sescore_english_mt](https://huggingface.co/xu1998hz/sescore_english_mt) |
+ | German | MT | [xu1998hz/sescore_german_mt](https://huggingface.co/xu1998hz/sescore_german_mt) |
+ | English | webNLG17 | [xu1998hz/sescore_english_webnlg17](https://huggingface.co/xu1998hz/sescore_english_webnlg17) |
+ | English | COCO captions | [xu1998hz/sescore_english_coco](https://huggingface.co/xu1998hz/sescore_english_coco) |
+
+ At present, `evaluate.load("xu1998hz/sescore")` downloads and uses the default English MT checkpoint (see `sescore.py`).
+
+ Please contact repo maintainer Wenda Xu to add your models!
+
+ ## Limitations
+
+ *SEScore* is trained on synthetic, in-domain data.
+ Although this data is generated to simulate user-relevant errors like deletion and spurious insertion, it may be limited in its ability to simulate humanlike errors.
+ Model applicability is domain-specific (e.g., a COCO-caption-trained model will be better suited to captioning than an MT-trained one).
+
+ We are in the process of producing and benchmarking general, language-level *SEScore* variants.
+
+ ## Citation
+
+ If you find our work useful, please cite the following:
+
+ ```bibtex
+ @inproceedings{xu-etal-2022-not,
+     title={Not All Errors are Equal: Learning Text Generation Metrics using Stratified Error Synthesis},
+     author={Xu, Wenda and Tuan, Yi-lin and Lu, Yujie and Saxon, Michael and Li, Lei and Wang, William Yang},
+     booktitle={Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing},
+     month={dec},
+     year={2022},
+     url={https://arxiv.org/abs/2210.05035}
+ }
+ ```
+
+ ## Acknowledgements
+
+ The work of the [COMET](https://github.com/Unbabel/COMET) maintainers at [Unbabel](https://duckduckgo.com/?t=ffab&q=unbabel&ia=web) has been instrumental in producing SEScore.
img/logo_sescore.png ADDED
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ git+https://github.com/huggingface/evaluate@main
+ unbabel-comet
+ torch
sescore.py ADDED
@@ -0,0 +1,139 @@
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """SEScore: a text generation evaluation metric"""
+
+ import evaluate
+ import datasets
+
+ import comet
+ from typing import Dict
+ import torch
+ from comet.encoders.base import Encoder
+ from comet.encoders.bert import BERTEncoder
+ from transformers import AutoModel, AutoTokenizer
+
+
+ class robertaEncoder(BERTEncoder):
+     def __init__(self, pretrained_model: str) -> None:
+         super(Encoder, self).__init__()
+         self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
+         self.model = AutoModel.from_pretrained(
+             pretrained_model, add_pooling_layer=False
+         )
+         self.model.encoder.output_hidden_states = True
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model: str) -> Encoder:
+         return robertaEncoder(pretrained_model)
+
+     def forward(
+         self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs
+     ) -> Dict[str, torch.Tensor]:
+         last_hidden_states, _, all_layers = self.model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             output_hidden_states=True,
+             return_dict=False,
+         )
+         return {
+             "sentemb": last_hidden_states[:, 0, :],
+             "wordemb": last_hidden_states,
+             "all_layers": all_layers,
+             "attention_mask": attention_mask,
+         }
+
+
+ _CITATION = """\
+ @inproceedings{xu-etal-2022-not,
+     title={Not All Errors are Equal: Learning Text Generation Metrics using Stratified Error Synthesis},
+     author={Xu, Wenda and Tuan, Yi-lin and Lu, Yujie and Saxon, Michael and Li, Lei and Wang, William Yang},
+     booktitle={Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing},
+     month={dec},
+     year={2022},
+     url={https://arxiv.org/abs/2210.05035}
+ }
+ """
+
+ _DESCRIPTION = """\
+ SEScore is an evaluation metric that tries to compute an overall score to measure text generation quality.
+ """
+
+ _KWARGS_DESCRIPTION = """
+ Calculates how good the predictions are, given some references.
+ Args:
+     predictions: list of candidate outputs
+     references: list of references
+ Returns:
+     {"mean_score": mean_score, "scores": scores}
+
+ Examples:
+     >>> import evaluate
+     >>> sescore = evaluate.load("xu1998hz/sescore")
+     >>> score = sescore.compute(
+     ...     references=['sescore is a simple but effective next-generation text evaluation metric'],
+     ...     predictions=['sescore is simple effective text evaluation metric for next generation']
+     ... )
+ """
+
+ # TODO: Define external resources urls if needed
+ BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
+
+
+ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+ class SEScore(evaluate.Metric):
+     """SEScore"""
+
+     def _info(self):
+         # TODO: Specifies the evaluate.EvaluationModuleInfo object
+         return evaluate.MetricInfo(
+             # This is the description that will appear on the modules page.
+             module_type="metric",
+             description=_DESCRIPTION,
+             citation=_CITATION,
+             inputs_description=_KWARGS_DESCRIPTION,
+             # This defines the format of each prediction and reference
+             features=datasets.Features({
+                 'predictions': datasets.Value("string", id="sequence"),
+                 'references': datasets.Value("string", id="sequence"),
+             }),
+             # Homepage of the module for documentation
+             homepage="http://module.homepage",
+             # Additional links to the codebase or references
+             codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
+             reference_urls=["http://path.to.reference.url/new_module"]
+         )
+
+     def _download_and_prepare(self, dl_manager):
+         """Download SEScore checkpoints to compute the scores."""
+         from comet import load_from_checkpoint
+         from huggingface_hub import snapshot_download
+         # register the RoBERTa encoder so COMET can rebuild the SEScore checkpoint
+         comet.encoders.str2encoder['RoBERTa'] = robertaEncoder
+         print("config name: ", self.config_name)
+         if self.config_name == "default":
+             destination = snapshot_download(repo_id="xu1998hz/sescore_english_mt", revision="main")
+             self.scorer = load_from_checkpoint(f'{destination}/checkpoint/sescore_english_mt.ckpt')
+         else:
+             print("Config name is not supported!")
+
+     def _compute(self, predictions, references, gpus=None, progress_bar=False):
+         if gpus is None:
+             gpus = 1 if torch.cuda.is_available() else 0
+
+         # convert the column format {"src": [...], "mt": [...]} into a list of row dicts
+         data = {"src": references, "mt": predictions}
+         data = [dict(zip(data, t)) for t in zip(*data.values())]
+         scores, mean_score = self.scorer.predict(data, gpus=gpus, progress_bar=progress_bar)
+         return {"mean_score": mean_score, "scores": scores}
tests.py ADDED
@@ -0,0 +1,17 @@
+ test_cases = [
+     {
+         "predictions": [0, 0],
+         "references": [1, 1],
+         "result": {"metric_score": 0}
+     },
+     {
+         "predictions": [1, 1],
+         "references": [1, 1],
+         "result": {"metric_score": 1}
+     },
+     {
+         "predictions": [1, 0],
+         "references": [1, 1],
+         "result": {"metric_score": 0.5}
+     }
+ ]