Kewen Zhao committed
Commit 7ddd4d3 · 1 Parent(s): 5906581

merge inputs into references

Files changed (2):
  1. README.md +9 -15
  2. code_eval_stdio.py +12 -9
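
In short, `compute` drops the separate `inputs` argument; each reference is now an `(input, expected_output)` pair. A minimal before/after sketch of a call site, assembled from the examples in this diff:

```python
import os

from evaluate import load

# Executing model-generated code requires explicit opt-in
# (see the HF_ALLOW_CODE_EVAL check in code_eval_stdio.py below).
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

code_eval_stdio = load("hage2000/code_eval_stdio")
candidates = [["nums = list(map(int, input().split()))\nprint(sum(nums))"]]

# Before this commit: separate parallel lists.
#   inputs = ["2 3"]
#   references = ["5"]
#   code_eval_stdio.compute(references=references, predictions=candidates, inputs=inputs, k=[1])

# After this commit: one list of (stdin, expected stdout) tuples.
references = [("2 3", "5")]
pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1])
print(pass_at_k)  # {'pass@1': 1.0}
```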
README.md CHANGED
@@ -43,9 +43,7 @@ The Code Eval metric calculates how good predictions are given a set of references
 
 `predictions`: a list of candidates to evaluate. Each candidate should be a list of strings with several code candidates to solve the problem.
 
-`references`: a list of expected output for each prediction.
-
-`inputs`: a list of inputs for each problem
+`references`: a list of `(str, str)` tuples, each holding the input and the expected output for the corresponding prediction.
 
 `k`: number of code candidates to consider in the evaluation. The default value is `[1, 10, 100]`.
 
@@ -56,10 +54,9 @@ The Code Eval metric calculates how good predictions are given a set of references
 ```python
 from evaluate import load
 code_eval_stdio = load("hage2000/code_eval_stdio")
-inputs = ["2 3"]
-references = ["5"]
+references = [("2 3", "5")]
 candidates = [[ "nums = list(map(int, input().split()))\nprint(sum(nums))"]]
-pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, inputs = inputs, k=[1, 2])
+pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
 ```
 
 N.B.
@@ -89,10 +86,9 @@ Full match at `k=1`:
 ```python
 from evaluate import load
 code_eval_stdio = load("hage2000/code_eval_stdio")
-inputs = ["2 3"]
-references = ["5"]
+references = [("2 3", "5")]
 candidates = [[ "nums = list(map(int, input().split()))\nprint(sum(nums))"]]
-pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, inputs = inputs, k=[1, 2])
+pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
 print(pass_at_k)
 {'pass@1': 1.0}
 ```
@@ -102,10 +98,9 @@ No match for k = 1:
 ```python
 from evaluate import load
 code_eval_stdio = load("hage2000/code_eval_stdio")
-inputs = ["2 3"]
-references = ["5"]
+references = [("2 3", "5")]
 candidates = [[ "nums = list(map(int, input().split()))\nprint(nums[0]*nums[1])"]]
-pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, inputs = inputs, k=[1, 2])
+pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
 print(pass_at_k)
 {'pass@1': 0.0}
 ```
@@ -115,10 +110,9 @@ Partial match at k=1, full match at k=2:
 ```python
 from evaluate import load
 code_eval_stdio = load("hage2000/code_eval_stdio")
-inputs = ["2 3"]
-references = ["5"]
+references = [("2 3", "5")]
 candidates = [[ "nums = list(map(int, input().split()))\nprint(sum(nums))", "nums = list(map(int, input().split()))\nprint(nums[0]*nums[1])"]]
-pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, inputs = inputs, k=[1, 2])
+pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
 print(pass_at_k)
 {'pass@1': 0.5, 'pass@2': 1.0}
 ```
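
For callers migrating existing data, the old parallel `inputs`/`references` lists pair up directly into the new tuple format. A minimal sketch (the variable names are illustrative):

```python
# Old-style parallel lists: one stdin string and one expected stdout per task.
inputs = ["2 3", "10 20 30"]
expected_outputs = ["5", "60"]

# New-style references: a list of (input, expected_output) tuples.
references = list(zip(inputs, expected_outputs))
assert references == [("2 3", "5"), ("10 20 30", "60")]
```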
code_eval_stdio.py CHANGED
@@ -77,10 +77,9 @@ Returns:
     results: dict with granular results of each unittest
 Examples:
     >>> code_eval_stdio = evaluate.load("hage2000/code_eval_stdio")
-    >>> inputs = ["2 3"]
-    >>> references = ["5"]
+    >>> references = [("2 3", "5")]
     >>> candidates = [[ "nums = list(map(int, input().split()))\nprint(sum(nums))"]]
-    >>> pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, inputs = inputs, k=[1, 2])
+    >>> pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
     >>> print(pass_at_k)
     {'pass@1': 0.5, 'pass@2': 1.0}
 """
@@ -144,7 +143,12 @@ class CodeEval(evaluate.Metric):
             features=datasets.Features(
                 {
                     "predictions": datasets.Sequence(datasets.Value("string")),
-                    "references": datasets.Value("string"),
+                    "references": datasets.Sequence(
+                        {
+                            "inputs": datasets.Value("string"),
+                            "expected_output": datasets.Value("string"),
+                        }
+                    ),
                 }
             ),
             homepage="https://github.com/openai/human-eval",
@@ -153,12 +157,11 @@ class CodeEval(evaluate.Metric):
             license=_LICENSE,
         )
 
-    def _compute(self, predictions, references, inputs, k=[1, 10, 100], num_workers=4, timeout=3.0):
+    def _compute(self, predictions, references, k=[1, 10, 100], num_workers=4, timeout=3.0):
         """
         Returns the scores
         predictions: List[List[str]] the python program
-        references: List[str] test output
-        inputs: List[str] test input
+        references: List[Tuple[str, str]] test inputs and expected outputs
         """
 
         if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
@@ -173,9 +176,9 @@ class CodeEval(evaluate.Metric):
             n_samples = 0
             results = defaultdict(list)
 
-            for task_id, (candidates, inputs, outputs) in enumerate(zip(predictions, inputs, references)):
+            for task_id, (candidates, (input_data, expected_output)) in enumerate(zip(predictions, references)):
                 for candidate in candidates:
-                    args = (candidate, inputs, outputs, timeout, task_id, completion_id[task_id])
+                    args = (candidate, input_data, expected_output, timeout, task_id, completion_id[task_id])
                     future = executor.submit(check_correctness, *args)
                     futures.append(future)
                     completion_id[task_id] += 1
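
`check_correctness` is defined elsewhere in the module and is not part of this diff; from the argument order above, it now receives the input and the expected output as separate values. A hypothetical sketch of such a stdin/stdout check, under the assumption that each candidate runs as a standalone Python script (this is an illustration, not the repository's actual implementation):

```python
import subprocess
import sys

def check_correctness_sketch(candidate, input_data, expected_output, timeout):
    """Hypothetical stand-in for check_correctness: run the candidate
    program, feed input_data on stdin, and compare trimmed stdout."""
    try:
        proc = subprocess.run(
            [sys.executable, "-c", candidate],
            input=input_data,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return False
    return proc.returncode == 0 and proc.stdout.strip() == expected_output.strip()
```

The real function additionally receives `task_id` and `completion_id` so that results can be grouped per problem when pass@k is computed.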