from typing import List

import datasets
import evaluate
import os
import tempfile
import subprocess
from pycocoevalcap.cider.cider import CiderScorer

_DESCRIPTION = """
The CIDEr (Consensus-based Image Description Evaluation) metric evaluates the quality of image captions generated
by models in image captioning tasks. It measures how well a generated caption matches human-written reference
captions by comparing TF-IDF-weighted n-gram vectors, so that n-grams that appear in the references but are rare
across the whole corpus contribute most to the score.

For a single n-gram length $ n $, the score is the average cosine similarity between the candidate caption and the
reference captions:

$ \\text{CIDEr}_n(c_i, S_i) = \\frac{1}{m} \\sum_{j=1}^{m} \\frac{\\mathbf{g}^n(c_i) \\cdot \\mathbf{g}^n(s_{ij})}{\\lVert \\mathbf{g}^n(c_i) \\rVert \\, \\lVert \\mathbf{g}^n(s_{ij}) \\rVert} $

and the final score combines all n-gram lengths:

$ \\text{CIDEr}(c_i, S_i) = \\sum_{n=1}^{N} w_n \\, \\text{CIDEr}_n(c_i, S_i) $

where:
- $ c_i $ is the candidate caption,
- $ S_i = \\{ s_{i1}, \\ldots, s_{im} \\} $ is the set of $ m $ reference captions for the same image,
- $ N $ is the maximum n-gram length (typically 4) and $ w_n = 1/N $ are uniform weights,
- $ \\mathbf{g}^n(\\cdot) $ is the vector of TF-IDF weights over all n-grams of length $ n $, where the term
  frequency (TF) is computed within a caption and the inverse document frequency (IDF) across the reference corpus.
"""

_KWARGS_DESCRIPTION = """
Args:
    predictions (`list` of `str`): Predicted captions.
    references (`list` of `list` of `str`): Ground-truth reference captions, one list of references per prediction.
    n (`int`, defaults to 4): Maximum n-gram length for which the n-gram representation is calculated.
    sigma (`float`, defaults to 6.0): Standard deviation of the Gaussian length penalty.
Returns:
    CIDEr (`float`): CIDEr score. Minimum possible value is 0. Maximum possible value is 10
        (the pycocoevalcap scorer scales the cosine similarities by 10).
"""

_CITATION = """
@inproceedings{vedantam2015cider,
  title={Cider: Consensus-based image description evaluation},
  author={Vedantam, Ramakrishna and Lawrence Zitnick, C and Parikh, Devi},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={4566--4575},
  year={2015}
}
"""

_URLS = {
    "stanford-corenlp": "https://repo1.maven.org/maven2/edu/stanford/nlp/stanford-corenlp/3.4.1/stanford-corenlp-3.4.1.jar"
}


def tokenize(tokenizer_path: str, predictions: List[str], references: List[List[str]]):
    """Tokenize predictions and references with the Stanford PTBTokenizer, dropping punctuation tokens."""
    PUNCTUATIONS = [
        "''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-",
        ".", "?", "!", ",", ":", "-", "--", "...", ";",
    ]
    cmd = [
        "java",
        "-cp",
        tokenizer_path,
        "edu.stanford.nlp.process.PTBTokenizer",
        "-preserveLines",
        "-lowerCase",
    ]

    # One sentence per line: all predictions first, then all references, flattened in order.
    sentences = "\n".join(
        [
            s.replace("\n", " ")
            for s in predictions + [ref for refs in references for ref in refs]
        ]
    )

    # Write the sentences to a temporary file and pass its path to the tokenizer.
    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(sentences.encode())
    cmd.append(f.name)

    p_tokenizer = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    token_lines = p_tokenizer.communicate()[0].decode()
    lines = [
        " ".join([w for w in line.rstrip().split(" ") if w not in PUNCTUATIONS])
        for line in token_lines.split("\n")
    ]
    os.remove(f.name)

    # Split the tokenized lines back into predictions and per-sample reference lists.
    pred_size = len(predictions)
    ref_sizes = [len(refs) for refs in references]
    predictions = lines[:pred_size]
    start = pred_size
    references = []
    for size in ref_sizes:
        references.append(lines[start : start + size])
        start += size
    return predictions, references


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CIDEr(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
"references": datasets.Sequence( datasets.Value("string", id="sequence"), id="references" ), } ), reference_urls=[ "https://github.com/salaniz/pycocoevalcap", "https://github.com/tylin/coco-caption", ], ) def _download_and_prepare(self, dl_manager): self.tokenizer_path = dl_manager.download(_URLS["stanford-corenlp"]) def _compute(self, predictions, references, n=4, sigma=6.0): predications, references = tokenize( self.tokenizer_path, predictions, references ) scorer = CiderScorer(n=n, sigma=sigma) for pred, refs in zip(predications, references): scorer += (pred, refs) score, scores = scorer.compute_score() return {"CIDEr": score}