Commit 7ddd4d3 · Kewen Zhao committed
1 parent: 5906581

merge inputs into references

Files changed:
- README.md (+9 -15)
- code_eval_stdio.py (+12 -9)
README.md
CHANGED
````diff
@@ -43,9 +43,7 @@ The Code Eval metric calculates how good are predictions given a set of referenc
 
 `predictions`: a list of candidates to evaluate. Each candidate should be a list of strings with several code candidates to solve the problem.
 
-`references`: a list of expected output for each prediction.
-
-`inputs`: a list of inputs for each problem
+`references`: a list of tuples of `[str, str]`, corresponding to the input and expected output for each prediction.
 
 `k`: number of code candidates to consider in the evaluation. The default value is `[1, 10, 100]`.
 
@@ -56,10 +54,9 @@ The Code Eval metric calculates how good are predictions given a set of referenc
 ```python
 from evaluate import load
 code_eval_stdio = load("hage2000/code_eval_stdio")
-
-references = ["5"]
+references = [("2 3", "5")]
 candidates = [["nums = list(map(int, input().split()))\nprint(sum(nums))"]]
-pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, inputs=inputs, k=[1, 2])
+pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
 ```
 
 N.B.
@@ -89,10 +86,9 @@ Full match at `k=1`:
 ```python
 from evaluate import load
 code_eval_stdio = load("hage2000/code_eval_stdio")
-
-references = ["5"]
+references = [("2 3", "5")]
 candidates = [["nums = list(map(int, input().split()))\nprint(sum(nums))"]]
-pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, inputs=inputs, k=[1, 2])
+pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
 print(pass_at_k)
 {'pass@1': 1.0}
 ```
@@ -102,10 +98,9 @@ No match for k = 1:
 ```python
 from evaluate import load
 code_eval_stdio = load("hage2000/code_eval_stdio")
-
-references = ["5"]
+references = [("2 3", "5")]
 candidates = [["nums = list(map(int, input().split()))\nprint(nums[0]*nums[1])"]]
-pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, inputs=inputs, k=[1, 2])
+pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
 print(pass_at_k)
 {'pass@1': 0.0}
 ```
@@ -115,10 +110,9 @@ Partial match at k=1, full match at k=2:
 ```python
 from evaluate import load
 code_eval_stdio = load("hage2000/code_eval_stdio")
-
-references = ["5"]
+references = [("2 3", "5")]
 candidates = [["nums = list(map(int, input().split()))\nprint(sum(nums))", "nums = list(map(int, input().split()))\nprint(nums[0]*nums[1])"]]
-pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, inputs=inputs, k=[1, 2])
+pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
 print(pass_at_k)
 {'pass@1': 0.5, 'pass@2': 1.0}
 ```
````
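For context on the numbers in these examples: pass@k follows the unbiased estimator from the Codex paper (Chen et al., 2021), which the upstream code_eval metric implements. A minimal sketch of that estimator (the helper name is illustrative; this module's internal implementation is not shown in the diff):

```python
import numpy as np

def estimate_pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: probability that at least one of k samples drawn
    without replacement from n candidates (c of them correct) passes."""
    if n - c < k:
        return 1.0  # fewer than k incorrect candidates, so any k-draw contains a correct one
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# The "partial match" example above has n=2 candidates, c=1 correct:
print(estimate_pass_at_k(2, 1, 1))  # 0.5 -> {'pass@1': 0.5}
print(estimate_pass_at_k(2, 1, 2))  # 1.0 -> {'pass@2': 1.0}
```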
code_eval_stdio.py
CHANGED
````diff
@@ -77,10 +77,9 @@ Returns:
     results: dict with granular results of each unittest
 Examples:
     >>> code_eval_stdio = evaluate.load("hage2000/code_eval_stdio")
-    >>>
-    >>> references = ["5"]
+    >>> references = [("2 3", "5")]
     >>> candidates = [["nums = list(map(int, input().split()))\nprint(sum(nums))"]]
-    >>> pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, inputs=inputs, k=[1, 2])
+    >>> pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
     >>> print(pass_at_k)
     {'pass@1': 0.5, 'pass@2': 1.0}
 """
@@ -144,7 +143,12 @@ class CodeEval(evaluate.Metric):
             features=datasets.Features(
                 {
                     "predictions": datasets.Sequence(datasets.Value("string")),
-                    "references": datasets.Value("string"),
+                    "references": datasets.Sequence(
+                        {
+                            "inputs": datasets.Value("string"),
+                            "expected_output": datasets.Value("string"),
+                        }
+                    ),
                 }
             ),
             homepage="https://github.com/openai/human-eval",
@@ -153,12 +157,11 @@ class CodeEval(evaluate.Metric):
             license=_LICENSE,
         )
 
-    def _compute(self, predictions, references, inputs, k=[1, 10, 100], num_workers=4, timeout=3.0):
+    def _compute(self, predictions, references, k=[1, 10, 100], num_workers=4, timeout=3.0):
         """
         Returns the scores
         predictions: List[List[str]] the python program
-        references: List[str] test expected output
-        inputs: List[str] test input
+        references: List[Tuple[str, str]] test inputs and expected outputs
         """
 
         if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
@@ -173,9 +176,9 @@ class CodeEval(evaluate.Metric):
             n_samples = 0
             results = defaultdict(list)
 
-            for task_id, (candidates, test_case, input_data) in enumerate(zip(predictions, references, inputs)):
+            for task_id, (candidates, (input_data, expected_output)) in enumerate(zip(predictions, references)):
                 for candidate in candidates:
-                    args = (candidate, input_data, test_case, timeout, task_id, completion_id[task_id])
+                    args = (candidate, input_data, expected_output, timeout, task_id, completion_id[task_id])
                     future = executor.submit(check_correctness, *args)
                     futures.append(future)
                     completion_id[task_id] += 1
````
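`check_correctness` itself is not part of this diff, so its body is unknown here. A minimal sketch of what a stdio-based checker could look like, matching the argument order used in the loop above; the subprocess strategy and the result fields are assumptions, not the module's actual implementation:

```python
import subprocess
import sys

def check_correctness(candidate, input_data, expected_output, timeout, task_id, completion_id):
    """Hypothetical sketch: run the candidate source in a fresh interpreter,
    feed input_data on stdin, and compare stripped stdout to expected_output."""
    try:
        proc = subprocess.run(
            [sys.executable, "-c", candidate],  # execute the candidate program
            input=input_data,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        passed = proc.returncode == 0 and proc.stdout.strip() == expected_output.strip()
        result = "passed" if passed else f"failed: got {proc.stdout.strip()!r}"
    except subprocess.TimeoutExpired:
        passed, result = False, "timed out"
    return {"task_id": task_id, "completion_id": completion_id, "passed": passed, "result": result}
```

As the guard in `_compute` shows, evaluation only runs when the environment variable `HF_ALLOW_CODE_EVAL` is set to `"1"`, since executing model-generated code should only be done in a sandboxed environment.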