Kewen Zhao committed
Commit 7ddd4d3 · 1 Parent(s): 5906581

merge inputs into references

Files changed (2):
  1. README.md +9 -15
  2. code_eval_stdio.py +12 -9
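
In short, `compute` drops the separate `inputs` argument; each reference is now an `(input, expected_output)` pair. A minimal before/after sketch of a call site, assembled from the examples in this diff:

```python
import os

from evaluate import load

# Executing model-generated code requires explicit opt-in
# (see the HF_ALLOW_CODE_EVAL check in code_eval_stdio.py below).
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

code_eval_stdio = load("hage2000/code_eval_stdio")
candidates = [["nums = list(map(int, input().split()))\nprint(sum(nums))"]]

# Before this commit: separate parallel lists.
#   inputs = ["2 3"]
#   references = ["5"]
#   code_eval_stdio.compute(references=references, predictions=candidates, inputs=inputs, k=[1])

# After this commit: one list of (stdin, expected stdout) tuples.
references = [("2 3", "5")]
pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1])
print(pass_at_k)  # {'pass@1': 1.0}
```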
README.md CHANGED
@@ -43,9 +43,7 @@ The Code Eval metric calculates how good predictions are given a set of references
 
 `predictions`: a list of candidates to evaluate. Each candidate should be a list of strings with several code candidates to solve the problem.
 
-`references`: a list of expected output for each prediction.
-
-`inputs`: a list of inputs for each problem
+`references`: a list of `(str, str)` tuples, each holding the input and the expected output for the corresponding prediction.
 
 `k`: number of code candidates to consider in the evaluation. The default value is `[1, 10, 100]`.
 
@@ -56,10 +54,9 @@ The Code Eval metric calculates how good predictions are given a set of references
 ```python
 from evaluate import load
 code_eval_stdio = load("hage2000/code_eval_stdio")
-inputs = ["2 3"]
-references = ["5"]
+references = [("2 3", "5")]
 candidates = [[ "nums = list(map(int, input().split()))\nprint(sum(nums))"]]
-pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, inputs = inputs, k=[1, 2])
+pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
 ```
 
 N.B.
@@ -89,10 +86,9 @@ Full match at `k=1`:
 ```python
 from evaluate import load
 code_eval_stdio = load("hage2000/code_eval_stdio")
-inputs = ["2 3"]
-references = ["5"]
+references = [("2 3", "5")]
 candidates = [[ "nums = list(map(int, input().split()))\nprint(sum(nums))"]]
-pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, inputs = inputs, k=[1, 2])
+pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
 print(pass_at_k)
 {'pass@1': 1.0}
 ```
@@ -102,10 +98,9 @@ No match for k = 1:
 ```python
 from evaluate import load
 code_eval_stdio = load("hage2000/code_eval_stdio")
-inputs = ["2 3"]
-references = ["5"]
+references = [("2 3", "5")]
 candidates = [[ "nums = list(map(int, input().split()))\nprint(nums[0]*nums[1])"]]
-pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, inputs = inputs, k=[1, 2])
+pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
 print(pass_at_k)
 {'pass@1': 0.0}
 ```
@@ -115,10 +110,9 @@ Partial match at k=1, full match at k=2:
 ```python
 from evaluate import load
 code_eval_stdio = load("hage2000/code_eval_stdio")
-inputs = ["2 3"]
-references = ["5"]
+references = [("2 3", "5")]
 candidates = [[ "nums = list(map(int, input().split()))\nprint(sum(nums))", "nums = list(map(int, input().split()))\nprint(nums[0]*nums[1])"]]
-pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, inputs = inputs, k=[1, 2])
+pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
 print(pass_at_k)
 {'pass@1': 0.5, 'pass@2': 1.0}
 ```
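
For callers migrating existing data, the old parallel `inputs`/`references` lists pair up directly into the new tuple format. A minimal sketch (the variable names are illustrative):

```python
# Old-style parallel lists: one stdin string and one expected stdout per task.
inputs = ["2 3", "10 20 30"]
expected_outputs = ["5", "60"]

# New-style references: a list of (input, expected_output) tuples.
references = list(zip(inputs, expected_outputs))
assert references == [("2 3", "5"), ("10 20 30", "60")]
```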
code_eval_stdio.py CHANGED
@@ -77,10 +77,9 @@ Returns:
     results: dict with granular results of each unittest
 Examples:
     >>> code_eval_stdio = evaluate.load("hage2000/code_eval_stdio")
-    >>> inputs = ["2 3"]
-    >>> references = ["5"]
+    >>> references = [("2 3", "5")]
     >>> candidates = [[ "nums = list(map(int, input().split()))\nprint(sum(nums))"]]
-    >>> pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, inputs = inputs, k=[1, 2])
+    >>> pass_at_k, results = code_eval_stdio.compute(references=references, predictions=candidates, k=[1, 2])
     >>> print(pass_at_k)
     {'pass@1': 0.5, 'pass@2': 1.0}
 """
@@ -144,7 +143,12 @@ class CodeEval(evaluate.Metric):
             features=datasets.Features(
                 {
                     "predictions": datasets.Sequence(datasets.Value("string")),
-                    "references": datasets.Value("string"),
+                    "references": datasets.Sequence(
+                        {
+                            "inputs": datasets.Value("string"),
+                            "expected_output": datasets.Value("string"),
+                        }
+                    ),
                 }
             ),
             homepage="https://github.com/openai/human-eval",
@@ -153,12 +157,11 @@ class CodeEval(evaluate.Metric):
             license=_LICENSE,
         )
 
-    def _compute(self, predictions, references, inputs, k=[1, 10, 100], num_workers=4, timeout=3.0):
+    def _compute(self, predictions, references, k=[1, 10, 100], num_workers=4, timeout=3.0):
         """
         Returns the scores
         predictions: List[List[str]] the python program
-        references: List[str] test output
-        inputs: List[str] test input
+        references: List[Tuple[str, str]] test inputs and expected outputs
         """
 
         if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
@@ -173,9 +176,9 @@ class CodeEval(evaluate.Metric):
             n_samples = 0
             results = defaultdict(list)
 
-            for task_id, (candidates, inputs, outputs) in enumerate(zip(predictions, inputs, references)):
+            for task_id, (candidates, (input_data, expected_output)) in enumerate(zip(predictions, references)):
                 for candidate in candidates:
-                    args = (candidate, inputs, outputs, timeout, task_id, completion_id[task_id])
+                    args = (candidate, input_data, expected_output, timeout, task_id, completion_id[task_id])
                     future = executor.submit(check_correctness, *args)
                     futures.append(future)
                     completion_id[task_id] += 1
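
`check_correctness` is defined elsewhere in the module and is not part of this diff; from the argument order above, it now receives the input and the expected output as separate values. A hypothetical sketch of such a stdin/stdout check, under the assumption that each candidate runs as a standalone Python script (this is an illustration, not the repository's actual implementation):

```python
import subprocess
import sys

def check_correctness_sketch(candidate, input_data, expected_output, timeout):
    """Hypothetical stand-in for check_correctness: run the candidate
    program, feed input_data on stdin, and compare trimmed stdout."""
    try:
        proc = subprocess.run(
            [sys.executable, "-c", candidate],
            input=input_data,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return False
    return proc.returncode == 0 and proc.stdout.strip() == expected_output.strip()
```

The real function additionally receives `task_id` and `completion_id` so that results can be grouped per problem when pass@k is computed.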