|
import os
import subprocess
import tempfile
from typing import List

import datasets
import evaluate

from pycocoevalcap.cider.cider_scorer import CiderScorer
|
|
|
_DESCRIPTION = """ |
|
The CIDEr (Consensus-based Image Description Evaluation) metric is used to evaluate the quality of image captions generated by models in image captioning tasks. |
|
It measures how well the generated caption matches human-written reference captions by considering both the frequency and the relevance of words or phrases. |
|
Here is the formula for the CIDEr metric in LaTeX code: |
|
|
|
$ |
|
\\text{CIDEr}(c_i, C) = \\frac{1}{N} \\sum_{n=1}^{N} w_n \\cdot \\frac{\\sum_{j=1}^{m} \\text{IDF}(g_j) \\cdot \\text{TF}(g_j, c_i)}{\\sum_{j=1}^{m} \\text{IDF}(g_j) \\cdot \\text{TF}(g_j, C)} |
|
$ |
|
|
|
where: |
|
- $ c_i $ is the candidate caption, |
|
- $ C $ is the set of reference captions, |
|
- $ N $ is the number of n-grams (typically 1 to 4), |
|
- $ w_n $ is the weight for the n-gram, |
|
- $ g_j $ represents the j-th n-gram, |
|
- $ \\text{TF}(g_j, c_i) $ is the term frequency of the n-gram $ g_j $ in the candidate caption $ c_i $, |
|
- $ \\text{TF}(g_j, C) $ is the term frequency of the n-gram $ g_j $ in the reference captions $ C $, |
|
- $ \\text{IDF}(g_j) $ is the inverse document frequency of the n-gram $ g_j $. |
|
""" |
|
|
|
|
|
_KWARGS_DESCRIPTION = """ |
|
Args: |
|
predictions (`list` of `str`): Predicted captions. |
|
references (`list` of `str` lists): Ground truth captions. |
|
n (int, defaults to 4): Number of ngrams for which (ngram) representation is calculated. |
|
sigma (float, defaults to 6.0): The standard deviation parameter for gaussian penalty. |
|
|
|
Returns: |
|
CIDEr (`float`): CIDEr value. Minimum possible value is 0. Maximum possible value is 100. |
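
Examples:
    A minimal sketch; it assumes this script is saved locally as ``cider.py``
    (that load path is an assumption, not a published Hub id) and that Java is
    available on PATH for the Stanford PTB tokenizer.

    >>> cider = evaluate.load("./cider.py")  # doctest: +SKIP
    >>> predictions = ["a man is riding a horse on the beach"]
    >>> references = [["a man rides a horse on the beach", "a person is riding a horse near the ocean"]]
    >>> results = cider.compute(predictions=predictions, references=references)  # doctest: +SKIP
    >>> print(results["CIDEr"])  # doctest: +SKIP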
|
|
|
""" |
|
|
|
|
|
_CITATION = """ |
|
@inproceedings{vedantam2015cider, |
|
title={Cider: Consensus-based image description evaluation}, |
|
author={Vedantam, Ramakrishna and Lawrence Zitnick, C and Parikh, Devi}, |
|
booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, |
|
pages={4566--4575}, |
|
year={2015} |
|
} |
|
""" |
|
|
|
_URLS = {
    "stanford-corenlp": "https://repo1.maven.org/maven2/edu/stanford/nlp/stanford-corenlp/3.4.1/stanford-corenlp-3.4.1.jar"
}
|
|
|
|
|
def tokenize(tokenizer_path: str, predictions: List[str], references: List[List[str]]):
    # Punctuation tokens dropped after tokenization, following the original
    # coco-caption PTBTokenizer wrapper.
    PUNCTUATIONS = [
        "''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-",
        ".", "?", "!", ",", ":", "-", "--", "...", ";",
    ]

    cmd = [
        "java",
        "-cp",
        tokenizer_path,
        "edu.stanford.nlp.process.PTBTokenizer",
        "-preserveLines",
        "-lowerCase",
    ]

    # Flatten predictions followed by all references into one sentence per line.
    sentences = "\n".join(
        s.replace("\n", " ")
        for s in predictions + [ref for refs in references for ref in refs]
    )

    # The tokenizer reads its input from a temporary file passed on the
    # command line (its stdin is not piped).
    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(sentences.encode())

    try:
        cmd.append(f.name)
        p_tokenizer = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        token_lines = p_tokenizer.communicate()[0].decode()
        lines = [
            " ".join(w for w in line.rstrip().split(" ") if w not in PUNCTUATIONS)
            for line in token_lines.split("\n")
        ]
    finally:
        os.remove(f.name)

    # Split the flat list of tokenized lines back into predictions and
    # per-sample reference lists, preserving the original order.
    pred_size = len(predictions)
    ref_sizes = [len(refs) for refs in references]

    predictions = lines[:pred_size]
    start = pred_size
    references = []
    for size in ref_sizes:
        references.append(lines[start : start + size])
        start += size

    return predictions, references
|
|
|
|
|
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CIDEr(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Sequence(
                        datasets.Value("string", id="sequence"), id="references"
                    ),
                }
            ),
            reference_urls=[
                "https://github.com/salaniz/pycocoevalcap",
                "https://github.com/tylin/coco-caption",
            ],
        )

    def _download_and_prepare(self, dl_manager):
        # Download the Stanford CoreNLP jar that provides the PTB tokenizer.
        self.tokenizer_path = dl_manager.download(_URLS["stanford-corenlp"])

    def _compute(self, predictions, references, n=4, sigma=6.0):
        predictions, references = tokenize(
            self.tokenizer_path, predictions, references
        )
        # CiderScorer's first two positional parameters are (test, refs),
        # so n and sigma must be passed by keyword.
        scorer = CiderScorer(n=n, sigma=sigma)
        for pred, refs in zip(predictions, references):
            scorer += (pred, refs)
        score, _ = scorer.compute_score()
        return {"CIDEr": score}
|
|