Spaces:

Kamichanw
/

CIDEr

Runtime error

File size: 4,930 Bytes

f19b88e

from typing import List
import datasets
import evaluate
import os
import tempfile
import subprocess

from pycocoevalcap.cider.cider import CiderScorer, Cider

# Human-readable description of the metric. It is passed to
# `evaluate.MetricInfo(description=...)` in `CIDEr._info` and is also prepended
# to the `CIDEr` class docstring via `add_start_docstrings`.
# Backslashes are doubled because this is a regular (non-raw) string literal.
_DESCRIPTION = """
The CIDEr (Consensus-based Image Description Evaluation) metric is used to evaluate the quality of image captions generated by models in image captioning tasks. 
It measures how well the generated caption matches human-written reference captions by considering both the frequency and the relevance of words or phrases.
Here is the formula for the CIDEr metric in LaTeX code:

$
\\text{CIDEr}(c_i, C) = \\frac{1}{N} \\sum_{n=1}^{N} w_n \\cdot \\frac{\\sum_{j=1}^{m} \\text{IDF}(g_j) \\cdot \\text{TF}(g_j, c_i)}{\\sum_{j=1}^{m} \\text{IDF}(g_j) \\cdot \\text{TF}(g_j, C)}
$

where:
- $ c_i $ is the candidate caption,
- $ C $ is the set of reference captions,
- $ N $ is the number of n-grams (typically 1 to 4),
- $ w_n $ is the weight for the n-gram,
- $ g_j $ represents the j-th n-gram,
- $ \\text{TF}(g_j, c_i) $ is the term frequency of the n-gram $ g_j $ in the candidate caption $ c_i $,
- $ \\text{TF}(g_j, C) $ is the term frequency of the n-gram $ g_j $ in the reference captions $ C $,
- $ \\text{IDF}(g_j) $ is the inverse document frequency of the n-gram $ g_j $.
"""


# Argument/return documentation for `CIDEr._compute`. It is passed to
# `evaluate.MetricInfo(inputs_description=...)` and prepended to the `CIDEr`
# class docstring via `add_start_docstrings`.
# NOTE(review): the text below states a maximum of 100; pycocoevalcap's
# CiderScorer appears to scale per-caption scores by 10 — confirm the real
# upper bound before relying on this sentence.
_KWARGS_DESCRIPTION = """
Args:
    predictions (`list` of `str`): Predicted captions.
    references (`list` of `str` lists): Ground truth captions. 
    n (int, defaults to 4): Number of ngrams for which (ngram) representation is calculated.
    sigma (float, defaults to 6.0): The standard deviation parameter for gaussian penalty.

Returns:
    CIDEr (`float`): CIDEr value. Minimum possible value is 0. Maximum possible value is 100.

"""


# BibTeX entry for the CIDEr paper; passed to
# `evaluate.MetricInfo(citation=...)` in `CIDEr._info`.
_CITATION = """
@inproceedings{vedantam2015cider,
  title={Cider: Consensus-based image description evaluation},
  author={Vedantam, Ramakrishna and Lawrence Zitnick, C and Parikh, Devi},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={4566--4575},
  year={2015}
}
"""

_URLS = {
    "stanford-corenlp": "https://repo1.maven.org/maven2/edu/stanford/nlp/stanford-corenlp/3.4.1/stanford-corenlp-3.4.1.jar"
}


def tokenize(tokenizer_path: str, predictions: List[str], references: List[List[str]]):
    """Tokenize captions with the Stanford PTBTokenizer and drop punctuation.

    All predictions followed by all (flattened) references are written one per
    line to a temporary file, tokenized and lower-cased by a single ``java``
    subprocess, and then regrouped into the original structure.

    Args:
        tokenizer_path: Classpath entry handed to ``java -cp`` (the jar that
            provides ``edu.stanford.nlp.process.PTBTokenizer``).
        predictions: Candidate captions, one string per example.
        references: For each example, its list of reference captions.

    Returns:
        A ``(predictions, references)`` tuple shaped like the inputs, where
        each caption is a space-joined string of the tokenizer's tokens with
        punctuation tokens removed.

    Raises:
        subprocess.CalledProcessError: If the tokenizer exits with a non-zero
            status.
        RuntimeError: If the tokenizer returns fewer lines than captions, in
            which case outputs could not be matched back to inputs.
    """
    # Tokens (as emitted by the tokenizer) that are discarded from captions.
    punctuations = frozenset(
        (
            "''",
            "'",
            "``",
            "`",
            "-LRB-",
            "-RRB-",
            "-LCB-",
            "-RCB-",
            ".",
            "?",
            "!",
            ",",
            ":",
            "-",
            "--",
            "...",
            ";",
        )
    )

    cmd = [
        "java",
        "-cp",
        tokenizer_path,
        "edu.stanford.nlp.process.PTBTokenizer",
        "-preserveLines",
        "-lowerCase",
    ]

    flat_references = [ref for refs in references for ref in refs]
    captions = list(predictions) + flat_references

    # The tokenizer is line-oriented (-preserveLines), so each caption must
    # occupy exactly one line. Collapse every kind of embedded line break
    # (\n, \r, \r\n, \u2028, ...) into a space; otherwise a single caption
    # would span several lines and shift all the captions after it.
    sentences = "\n".join(" ".join(s.splitlines()) for s in captions)

    tmp = tempfile.NamedTemporaryFile(delete=False)
    try:
        # Close the file before java reads it so the content is flushed.
        with tmp:
            tmp.write(sentences.encode("utf-8"))
        completed = subprocess.run(
            cmd + [tmp.name], stdout=subprocess.PIPE, check=True
        )
    finally:
        # Always clean up, even when java is missing or tokenization fails.
        os.remove(tmp.name)

    lines = [
        " ".join(w for w in line.rstrip().split(" ") if w not in punctuations)
        for line in completed.stdout.decode("utf-8").split("\n")
    ]

    if len(lines) < len(captions):
        raise RuntimeError(
            f"Tokenizer returned {len(lines)} lines for {len(captions)} captions."
        )

    # Undo the flattening: predictions come first, then each example's
    # references in their original order and group sizes.
    pred_size = len(predictions)
    tokenized_predictions = lines[:pred_size]
    tokenized_references = []
    start = pred_size
    for refs in references:
        size = len(refs)
        tokenized_references.append(lines[start : start + size])
        start += size

    return tokenized_predictions, tokenized_references


# `evaluate` metric that tokenizes captions with the Stanford PTBTokenizer and
# scores them with pycocoevalcap's `CiderScorer`.
# (Kept as a comment rather than a class docstring so the docstring assembled
# by `add_start_docstrings` is unchanged.)
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CIDEr(evaluate.Metric):
    def _info(self):
        """Return the metric metadata and the expected input features."""
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # One prediction string and a sequence of reference strings
            # per example.
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Sequence(
                        datasets.Value("string", id="sequence"), id="references"
                    ),
                }
            ),
            reference_urls=[
                "https://github.com/salaniz/pycocoevalcap",
                "https://github.com/tylin/coco-caption",
            ],
        )

    def _download_and_prepare(self, dl_manager):
        """Download the Stanford CoreNLP jar and remember its local path."""
        self.tokenizer_path = dl_manager.download(_URLS["stanford-corenlp"])

    def _compute(self, predictions, references, n=4, sigma=6.0):
        """Tokenize the captions and compute the corpus-level CIDEr score.

        Args:
            predictions: Predicted captions, one string per example.
            references: For each example, a list of reference captions.
            n: Maximum n-gram order forwarded to ``CiderScorer``.
            sigma: Gaussian length-penalty parameter forwarded to
                ``CiderScorer``.

        Returns:
            A dict with the single key ``"CIDEr"`` holding the overall score.
        """
        predictions, references = tokenize(
            self.tokenizer_path, predictions, references
        )
        # `CiderScorer`'s first two positional parameters are `test` and
        # `refs`, so `n` and `sigma` must be passed by keyword; passing them
        # positionally makes the constructor treat them as a caption pair.
        scorer = CiderScorer(n=n, sigma=sigma)
        for pred, refs in zip(predictions, references):
            scorer += (pred, refs)
        # The second return value (per-example scores) is not exposed.
        score, _ = scorer.compute_score()
        return {"CIDEr": score}