bdsaglam committed on
Commit
898eb24
·
1 Parent(s): 42090ad

add musique metric

Browse files
Files changed (2) hide show
  1. musique.py +109 -29
  2. tests.py +4 -14
musique.py CHANGED
@@ -13,6 +13,10 @@
13
  # limitations under the License.
14
  """TODO: Add a description here."""
15
 
 
 
 
 
16
  import evaluate
17
  import datasets
18
 
@@ -26,40 +30,37 @@ year={2020}
26
  }
27
  """
28
 
29
- # TODO: Add description of the module here
30
  _DESCRIPTION = """\
31
- This new module is designed to solve this great ML task and is crafted with a lot of care.
 
 
 
32
  """
33
 
34
 
35
- # TODO: Add description of the arguments of the module here
36
  _KWARGS_DESCRIPTION = """
37
  Calculates how good are predictions given some references, using certain scores
38
  Args:
39
- predictions: list of predictions to score. Each predictions
40
- should be a string with tokens separated by spaces.
41
- references: list of reference for each prediction. Each
42
- reference should be a string with tokens separated by spaces.
43
  Returns:
44
- accuracy: description of the first score,
45
- another_score: description of the second score,
46
  Examples:
47
- Examples should be written in doctest format, and should illustrate how
48
- to use the function.
49
-
50
- >>> my_new_module = evaluate.load("my_new_module")
51
- >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
52
  >>> print(results)
53
- {'accuracy': 1.0}
54
  """
55
 
56
- # TODO: Define external resources urls if needed
57
- BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
58
-
59
 
60
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
61
  class musique(evaluate.Metric):
62
- """TODO: Short description of my evaluation module."""
63
 
64
  def _info(self):
65
  # TODO: Specifies the evaluate.EvaluationModuleInfo object
@@ -70,26 +71,105 @@ class musique(evaluate.Metric):
70
  citation=_CITATION,
71
  inputs_description=_KWARGS_DESCRIPTION,
72
  # This defines the format of each prediction and reference
73
- features=datasets.Features({
74
- 'predictions': datasets.Value('int64'),
75
- 'references': datasets.Value('int64'),
76
- }),
 
 
 
 
77
  # Homepage of the module for documentation
78
  homepage="http://module.homepage",
79
  # Additional links to the codebase or references
80
  codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
81
- reference_urls=["http://path.to.reference.url/new_module"]
82
  )
83
 
84
  def _download_and_prepare(self, dl_manager):
85
  """Optional: download external resources useful to compute the scores"""
86
- # TODO: Download external resources if needed
87
  pass
88
 
89
  def _compute(self, predictions, references):
90
  """Returns the scores"""
91
- # TODO: Compute the different scores of the module
92
- accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  return {
94
- "accuracy": accuracy,
95
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  # limitations under the License.
14
  """TODO: Add a description here."""
15
 
16
+ import re
17
+ import string
18
+ import collections
19
+ from typing import Callable
20
  import evaluate
21
  import datasets
22
 
 
30
  }
31
  """
32
 
 
33
  _DESCRIPTION = """\
34
+ Question-answering metrics (`Exact Match` and `F1`) for Musique-Answerable dataset.
35
+
36
+ The implementation is taken from Musique repository.
37
+ https://github.com/StonyBrookNLP/musique
38
  """
39
 
40
 
 
41
  _KWARGS_DESCRIPTION = """
42
  Calculates how good are predictions given some references, using certain scores
43
  Args:
44
+ predictions: list of predicted answers.
45
+ references: list of ground truth answers. Each reference should be a list of
46
+ ground truth answers for the corresponding prediction.
 
47
  Returns:
48
+ exact_match: Exact match score,
49
+ f1: F1 score over tokens
50
  Examples:
51
+ >>> my_new_module = evaluate.load("musique")
52
+ >>> results = my_new_module.compute(
53
+ references=[["New York City", "NYC"], ["Einstein", "Albert Einstein"]],
54
+ predictions=["New York City", "Albert Einstein"],
55
+ )
56
  >>> print(results)
57
+ {'exact_match': 1.0, 'f1': 1.0}
58
  """
59
 
 
 
 
60
 
61
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
62
  class musique(evaluate.Metric):
63
+ """Question answering metrics (EM and F1) for Musique-Answerable dataset."""
64
 
65
  def _info(self):
66
  # TODO: Specifies the evaluate.EvaluationModuleInfo object
 
71
  citation=_CITATION,
72
  inputs_description=_KWARGS_DESCRIPTION,
73
  # This defines the format of each prediction and reference
74
+ features=datasets.Features(
75
+ {
76
+ "predictions": datasets.features.Sequence(datasets.Value("string")),
77
+ "references": datasets.features.Sequence(
78
+ datasets.features.Sequence(datasets.Value("string"))
79
+ ),
80
+ }
81
+ ),
82
  # Homepage of the module for documentation
83
  homepage="http://module.homepage",
84
  # Additional links to the codebase or references
85
  codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
86
+ reference_urls=["http://path.to.reference.url/new_module"],
87
  )
88
 
89
  def _download_and_prepare(self, dl_manager):
90
  """Optional: download external resources useful to compute the scores"""
 
91
  pass
92
 
93
  def _compute(self, predictions, references):
94
  """Returns the scores"""
95
+
96
+ if len(predictions) != len(references):
97
+ raise ValueError(
98
+ "The number of predictions and references should be the same."
99
+ )
100
+
101
+ if len(predictions) == 0:
102
+ return {"exact_match": 0.0, "f1": 0.0}
103
+
104
+ exact_scores = [
105
+ metric_max_over_ground_truths(compute_exact, prediction, reference)
106
+ for prediction, reference in zip(predictions, references)
107
+ ]
108
+ f1_scores = [
109
+ metric_max_over_ground_truths(compute_f1, prediction, reference)
110
+ for prediction, reference in zip(predictions, references)
111
+ ]
112
  return {
113
+ "exact_match": sum(exact_scores) / len(exact_scores),
114
+ "f1": sum(f1_scores) / len(f1_scores),
115
+ }
116
+
117
+
118
+ # Source: https://github.com/StonyBrookNLP/musique/blob/main/metrics/answer.py
119
+
120
+
121
# Built once at import time: scoring calls normalize_answer for every
# prediction/ground-truth pair, so per-call rebuilding is wasted work.
_ARTICLES_RE = re.compile(r"\b(a|an|the)\b", re.UNICODE)
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace.

    The steps run in this order: lowercase, delete every character in
    `string.punctuation`, replace the standalone words "a", "an" and "the"
    with a space, then collapse all whitespace runs to single spaces.

    Args:
        s: the answer string to normalize.

    Returns:
        The normalized string (possibly empty).
    """
    text = s.lower().translate(_PUNCTUATION_TABLE)
    # Articles are removed after punctuation so that e.g. "the," is matched.
    text = _ARTICLES_RE.sub(" ", text)
    return " ".join(text.split())
139
+
140
+
141
def get_tokens(s):
    """Return the whitespace-separated tokens of the normalized answer.

    Falsy input (empty string or None) yields an empty list without
    calling the normalizer.
    """
    return normalize_answer(s).split() if s else []
145
+
146
+
147
def compute_exact(a_gold, a_pred):
    """Return 1 if both answers are equal after normalization, otherwise 0."""
    gold_norm = normalize_answer(a_gold)
    pred_norm = normalize_answer(a_pred)
    return 1 if gold_norm == pred_norm else 0
149
+
150
+
151
def compute_f1(a_gold, a_pred):
    """Token-level F1 between a gold answer and a predicted answer.

    Tokens come from `get_tokens`. When either side has no tokens the
    result is 1 if both are empty and 0 otherwise; when no tokens overlap
    the result is 0.
    """
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)

    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
    if not gold_toks or not pred_toks:
        return int(gold_toks == pred_toks)

    # Multiset intersection counts each shared token at most as often as it
    # occurs on both sides.
    overlap = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0

    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    return (2 * precision * recall) / (precision + recall)
165
+
166
+
167
def metric_max_over_ground_truths(
    metric_fn: Callable[[str, str], float],
    prediction: str,
    ground_truths: list[str],
) -> float:
    """Return the best score of `prediction` against any of `ground_truths`.

    Args:
        metric_fn: scoring function called as `metric_fn(prediction, ground_truth)`.
        prediction: the predicted answer.
        ground_truths: the acceptable answers for this prediction.

    Returns:
        The maximum value `metric_fn` produced over all ground truths.

    Raises:
        ValueError: if `ground_truths` is empty, since there is nothing to
            score against.
    """
    scores_for_ground_truths = [
        metric_fn(prediction, ground_truth) for ground_truth in ground_truths
    ]
    if not scores_for_ground_truths:
        # max() would also raise ValueError here, but with a message that
        # gives no hint about which input was at fault.
        raise ValueError(
            "Each prediction needs at least one ground truth answer to be scored."
        )
    return max(scores_for_ground_truths)
tests.py CHANGED
@@ -1,17 +1,7 @@
1
  test_cases = [
2
  {
3
- "predictions": [0, 0],
4
- "references": [1, 1],
5
- "result": {"metric_score": 0}
6
  },
7
- {
8
- "predictions": [1, 1],
9
- "references": [1, 1],
10
- "result": {"metric_score": 1}
11
- },
12
- {
13
- "predictions": [1, 0],
14
- "references": [1, 1],
15
- "result": {"metric_score": 0.5}
16
- }
17
- ]
 
1
  test_cases = [
2
  {
3
+ "predictions": ["New York City", "Albert Einstein"],
4
+ "references": [["New York City", "NYC"], ["Einstein", "Albert Einstein"]],
5
+ "result": {"exact_match": 1.0, "f1": 1.0},
6
  },
7
+ ]