qgallouedec HF staff committed on
Commit
75cad04
·
1 Parent(s): a40ca17

update to results_v2

Browse files
Files changed (3) hide show
  1. requirements.txt +1 -0
  2. src/backend.py +166 -54
  3. src/evaluation.py +18 -39
requirements.txt CHANGED
@@ -11,6 +11,7 @@ free-mujoco-py
11
  mujoco<=2.3.7
12
  numpy==1.24.2
13
  pandas==2.0.0
 
14
  python-dateutil==2.8.2
15
  requests==2.28.2
16
  rliable==1.0.8
 
11
  mujoco<=2.3.7
12
  numpy==1.24.2
13
  pandas==2.0.0
14
+ pybullet_envs_gymnasium==0.4.0
15
  python-dateutil==2.8.2
16
  requests==2.28.2
17
  rliable==1.0.8
src/backend.py CHANGED
@@ -1,10 +1,11 @@
1
- import json
2
  import os
3
  import random
4
- import re
5
- import tempfile
6
 
7
- from huggingface_hub import CommitOperationAdd, HfApi
 
 
8
 
9
  from src.evaluation import evaluate
10
  from src.logging import setup_logger
@@ -12,71 +13,182 @@ from src.logging import setup_logger
12
  logger = setup_logger(__name__)
13
 
14
  API = HfApi(token=os.environ.get("TOKEN"))
15
- RESULTS_REPO = "open-rl-leaderboard/results"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
 
18
  def _backend_routine():
19
  # List only the text classification models
20
- rl_models = list(API.list_models(filter="reinforcement-learning"))
21
  logger.info(f"Found {len(rl_models)} RL models")
22
- compatible_models = []
23
- for model in rl_models:
 
 
 
 
 
24
  filenames = [sib.rfilename for sib in model.siblings]
25
  if "agent.pt" in filenames:
26
- compatible_models.append((model.modelId, model.sha))
27
 
28
- logger.info(f"Found {len(compatible_models)} compatible models")
29
 
30
- # Get the results
31
- pattern = re.compile(r"^[^/]*/[^/]*/[^/]*results_[a-f0-9]+\.json$")
32
- filenames = API.list_repo_files(RESULTS_REPO, repo_type="dataset")
33
- filenames = [filename for filename in filenames if pattern.match(filename)]
34
-
35
- evaluated_models = set()
36
- for filename in filenames:
37
- path = API.hf_hub_download(repo_id=RESULTS_REPO, filename=filename, repo_type="dataset")
38
- with open(path) as fp:
39
- report = json.load(fp)
40
- evaluated_models.add((report["config"]["model_id"], report["config"]["model_sha"]))
41
 
42
- # Find the models that are not associated with any results
43
- pending_models = list(set(compatible_models) - evaluated_models)
44
- logger.info(f"Found {len(pending_models)} pending models")
45
 
46
- if len(pending_models) == 0:
47
- return None
 
 
48
 
49
  # Run an evaluation on the models
50
- with tempfile.TemporaryDirectory() as tmp_dir:
51
- commits = []
52
- model_id, sha = random.choice(pending_models)
53
- logger.info(f"Running evaluation on {model_id}")
54
- report = {"config": {"model_id": model_id, "model_sha": sha}}
 
 
 
55
  try:
56
- evaluations = evaluate(model_id, revision=sha)
 
 
 
57
  except Exception as e:
58
- logger.error(f"Error evaluating {model_id}: {e}")
59
- evaluations = None
60
-
61
- if evaluations is not None:
62
- report["results"] = evaluations
63
- report["status"] = "DONE"
64
- else:
65
- report["status"] = "FAILED"
66
-
67
- # Update the results
68
- dumped = json.dumps(report, indent=2)
69
- path_in_repo = f"{model_id}/results_{sha}.json"
70
- local_path = os.path.join(tmp_dir, path_in_repo)
71
- os.makedirs(os.path.dirname(local_path), exist_ok=True)
72
- with open(local_path, "w") as f:
73
- f.write(dumped)
74
-
75
- commits.append(CommitOperationAdd(path_in_repo=path_in_repo, path_or_fileobj=local_path))
76
-
77
- API.create_commit(
78
- repo_id=RESULTS_REPO, commit_message="Add evaluation results", operations=commits, repo_type="dataset"
79
- )
80
 
81
 
82
  def backend_routine():
 
1
+ import fnmatch
2
  import os
3
  import random
4
+ import time
 
5
 
6
+ import pybullet_envs_gymnasium # noqa: F401 pylint: disable=unused-import
7
+ from datasets import load_dataset
8
+ from huggingface_hub import HfApi
9
 
10
  from src.evaluation import evaluate
11
  from src.logging import setup_logger
 
13
  logger = setup_logger(__name__)
14
 
15
  API = HfApi(token=os.environ.get("TOKEN"))
16
+ RESULTS_REPO = "open-rl-leaderboard/results_v2"
17
+
18
+ ALL_ENV_IDS = [
19
+ "AdventureNoFrameskip-v4",
20
+ "AirRaidNoFrameskip-v4",
21
+ "AlienNoFrameskip-v4",
22
+ "AmidarNoFrameskip-v4",
23
+ "AssaultNoFrameskip-v4",
24
+ "AsterixNoFrameskip-v4",
25
+ "AsteroidsNoFrameskip-v4",
26
+ "AtlantisNoFrameskip-v4",
27
+ "BankHeistNoFrameskip-v4",
28
+ "BattleZoneNoFrameskip-v4",
29
+ "BeamRiderNoFrameskip-v4",
30
+ "BerzerkNoFrameskip-v4",
31
+ "BowlingNoFrameskip-v4",
32
+ "BoxingNoFrameskip-v4",
33
+ "BreakoutNoFrameskip-v4",
34
+ "CarnivalNoFrameskip-v4",
35
+ "CentipedeNoFrameskip-v4",
36
+ "ChopperCommandNoFrameskip-v4",
37
+ "CrazyClimberNoFrameskip-v4",
38
+ "DefenderNoFrameskip-v4",
39
+ "DemonAttackNoFrameskip-v4",
40
+ "DoubleDunkNoFrameskip-v4",
41
+ "ElevatorActionNoFrameskip-v4",
42
+ "EnduroNoFrameskip-v4",
43
+ "FishingDerbyNoFrameskip-v4",
44
+ "FreewayNoFrameskip-v4",
45
+ "FrostbiteNoFrameskip-v4",
46
+ "GopherNoFrameskip-v4",
47
+ "GravitarNoFrameskip-v4",
48
+ "HeroNoFrameskip-v4",
49
+ "IceHockeyNoFrameskip-v4",
50
+ "JamesbondNoFrameskip-v4",
51
+ "JourneyEscapeNoFrameskip-v4",
52
+ "KangarooNoFrameskip-v4",
53
+ "KrullNoFrameskip-v4",
54
+ "KungFuMasterNoFrameskip-v4",
55
+ "MontezumaRevengeNoFrameskip-v4",
56
+ "MsPacmanNoFrameskip-v4",
57
+ "NameThisGameNoFrameskip-v4",
58
+ "PhoenixNoFrameskip-v4",
59
+ "PitfallNoFrameskip-v4",
60
+ "PongNoFrameskip-v4",
61
+ "PooyanNoFrameskip-v4",
62
+ "PrivateEyeNoFrameskip-v4",
63
+ "QbertNoFrameskip-v4",
64
+ "RiverraidNoFrameskip-v4",
65
+ "RoadRunnerNoFrameskip-v4",
66
+ "RobotankNoFrameskip-v4",
67
+ "SeaquestNoFrameskip-v4",
68
+ "SkiingNoFrameskip-v4",
69
+ "SolarisNoFrameskip-v4",
70
+ "SpaceInvadersNoFrameskip-v4",
71
+ "StarGunnerNoFrameskip-v4",
72
+ "TennisNoFrameskip-v4",
73
+ "TimePilotNoFrameskip-v4",
74
+ "TutankhamNoFrameskip-v4",
75
+ "UpNDownNoFrameskip-v4",
76
+ "VentureNoFrameskip-v4",
77
+ "VideoPinballNoFrameskip-v4",
78
+ "WizardOfWorNoFrameskip-v4",
79
+ "YarsRevengeNoFrameskip-v4",
80
+ "ZaxxonNoFrameskip-v4",
81
+ # Box2D
82
+ "BipedalWalker-v3",
83
+ "BipedalWalkerHardcore-v3",
84
+ "CarRacing-v2",
85
+ "LunarLander-v2",
86
+ "LunarLanderContinuous-v2",
87
+ # Toy text
88
+ "Blackjack-v1",
89
+ "CliffWalking-v0",
90
+ "FrozenLake-v1",
91
+ "FrozenLake8x8-v1",
92
+ # Classic control
93
+ "Acrobot-v1",
94
+ "CartPole-v1",
95
+ "MountainCar-v0",
96
+ "MountainCarContinuous-v0",
97
+ "Pendulum-v1",
98
+ # MuJoCo
99
+ "Ant-v4",
100
+ "HalfCheetah-v4",
101
+ "Hopper-v4",
102
+ "Humanoid-v4",
103
+ "HumanoidStandup-v4",
104
+ "InvertedDoublePendulum-v4",
105
+ "InvertedPendulum-v4",
106
+ "Pusher-v4",
107
+ "Reacher-v4",
108
+ "Swimmer-v4",
109
+ "Walker2d-v4",
110
+ # PyBullet
111
+ "AntBulletEnv-v0",
112
+ "HalfCheetahBulletEnv-v0",
113
+ "HopperBulletEnv-v0",
114
+ "HumanoidBulletEnv-v0",
115
+ "InvertedDoublePendulumBulletEnv-v0",
116
+ "InvertedPendulumSwingupBulletEnv-v0",
117
+ "MinitaurBulletEnv-v0",
118
+ "ReacherBulletEnv-v0",
119
+ "Walker2DBulletEnv-v0",
120
+ ]
121
+
122
+
def pattern_match(patterns, source_list):
    """Return the unique entries of ``source_list`` that match any of ``patterns``.

    Matching uses ``fnmatch.filter``, i.e. shell-style wildcards (``*``, ``?``,
    ``[seq]``), not regular expressions.

    Args:
        patterns: A single pattern string or an iterable of pattern strings.
            ``None`` is treated as "no patterns".
        source_list: The candidate strings to test against the patterns.

    Returns:
        A sorted list of the distinct entries of ``source_list`` matched by at
        least one pattern; empty when nothing matches.
    """
    if patterns is None:
        return []
    if isinstance(patterns, str):
        # A bare string would otherwise be iterated character by character.
        patterns = [patterns]

    matches = set()
    for pattern in patterns:
        matches.update(fnmatch.filter(source_list, pattern))
    return sorted(matches)
132
 
133
 
def _backend_routine():
    """Evaluate one pending RL model and append the outcome to the results dataset.

    Steps:
      1. List the Hub models tagged ``reinforcement-learning``.
      2. Load ``RESULTS_REPO`` and drop every ``(repo_id, sha)`` pair that
         already has a row there.
      3. Keep only the remaining models that ship an ``agent.pt`` file.
      4. Pick one of them at random, resolve its environment from the model
         tags, and run ``evaluate`` on it.
      5. Re-load the dataset, append the result row and push it back.

    Returns:
        ``None`` in all cases; returns early when there is nothing to evaluate.
    """
    # List only the reinforcement-learning models. The full model objects are
    # kept (not just id/sha) because the file listing is needed further down.
    rl_models = list(API.list_models(filter=["reinforcement-learning"]))
    logger.info(f"Found {len(rl_models)} RL models")

    dataset = load_dataset(
        RESULTS_REPO, split="train", download_mode="force_redownload", verification_mode="no_checks"
    )
    # A set gives O(1) membership tests in the loop below.
    evaluated_models = {("/".join([x["user_id"], x["model_id"]]), x["sha"]) for x in dataset}

    pending_and_compatible_models = []
    for model in rl_models:
        key = (model.modelId, model.sha)
        if key in evaluated_models:
            continue
        # ``siblings`` may be missing on some listings; treat that as "no files".
        filenames = [sib.rfilename for sib in (model.siblings or [])]
        if "agent.pt" in filenames:
            pending_and_compatible_models.append(key)

    logger.info(f"Found {len(pending_and_compatible_models)} compatible pending models")

    if not pending_and_compatible_models:
        return None

    # Select a random model
    repo_id, sha = random.choice(pending_and_compatible_models)
    user_id, model_id = repo_id.split("/")
    row = {"model_id": model_id, "user_id": user_id, "sha": sha}

    # Run an evaluation on the model
    model_info = API.model_info(repo_id, revision=sha)

    # Extract the environment IDs from the tags (usually only one)
    env_ids = pattern_match(model_info.tags, ALL_ENV_IDS)
    if env_ids:
        env_id = env_ids[0]
        logger.info(f"Running evaluation on {user_id}/{model_id}")

        try:
            episodic_returns = evaluate(repo_id, sha, env_id)
        except Exception as e:
            # Best-effort: a broken submission must not stop the routine, it is
            # recorded as FAILED instead. ``exception`` also logs the traceback.
            logger.exception(f"Error evaluating {repo_id}: {e}")
            row["status"] = "FAILED"
        else:
            if episodic_returns is None:
                # ``evaluate`` signals "agent missing / not loadable" with None.
                logger.error(f"Error evaluating {repo_id}: no episodic returns produced")
                row["status"] = "FAILED"
            else:
                row["status"] = "DONE"
                row["env_id"] = env_id
                row["episodic_returns"] = episodic_returns
    else:
        logger.error(f"No environment found for {model_id}")
        row["status"] = "FAILED"

    # Load the last version of the dataset: the evaluation may have taken a
    # while, so the copy loaded at the top can be stale.
    dataset = load_dataset(
        RESULTS_REPO, split="train", download_mode="force_redownload", verification_mode="no_checks"
    )
    # ``add_item`` returns a new dataset rather than mutating in place.
    dataset = dataset.add_item(row)
    dataset.push_to_hub(RESULTS_REPO, split="train", token=API.token)
    time.sleep(60)  # Sleep for 1 minute to avoid rate limiting
 
 
 
 
 
 
 
192
 
193
 
194
  def backend_routine():
src/evaluation.py CHANGED
@@ -1,4 +1,3 @@
1
- import fnmatch
2
  import os
3
  from typing import Dict, SupportsFloat
4
 
@@ -303,35 +302,18 @@ def make(env_id):
303
  return thunk
304
 
305
 
306
- def pattern_match(patterns, source_list):
307
- if isinstance(patterns, str):
308
- patterns = [patterns]
309
-
310
- env_ids = set()
311
- for pattern in patterns:
312
- for matching in fnmatch.filter(source_list, pattern):
313
- env_ids.add(matching)
314
- return sorted(list(env_ids))
315
-
316
-
317
- def evaluate(model_id, revision):
318
- tags = API.model_info(model_id, revision=revision).tags
319
-
320
- # Extract the environment IDs from the tags (usually only one)
321
- env_ids = pattern_match(tags, ALL_ENV_IDS)
322
- logger.info(f"Selected environments: {env_ids}")
323
-
324
- results = {}
325
 
326
  # Check if the agent exists
327
  try:
328
- agent_path = API.hf_hub_download(repo_id=model_id, filename="agent.pt")
329
  except EntryNotFoundError:
330
  logger.error("Agent not found")
331
  return None
332
 
333
  # Check safety
334
- security = next(iter(API.get_paths_info(model_id, "agent.pt", expand=True))).security
335
  if security is None or "safe" not in security:
336
  logger.warn("Agent safety not available")
337
  # return None
@@ -341,25 +323,22 @@ def evaluate(model_id, revision):
341
 
342
  # Load the agent
343
  try:
344
- agent = torch.jit.load(agent_path).to("cuda")
345
  except Exception as e:
346
  logger.error(f"Error loading agent: {e}")
347
  return None
348
 
349
  # Evaluate the agent on the environments
350
- for env_id in env_ids:
351
- envs = gym.vector.SyncVectorEnv([make(env_id) for _ in range(1)])
352
- observations, _ = envs.reset()
353
- episodic_returns = []
354
- while len(episodic_returns) < NUM_EPISODES:
355
- actions = agent(torch.tensor(observations)).numpy()
356
- observations, _, _, _, infos = envs.step(actions)
357
- if "final_info" in infos:
358
- for info in infos["final_info"]:
359
- if info is None or "episode" not in info:
360
- continue
361
- episodic_returns.append(float(info["episode"]["r"]))
362
-
363
- results[env_id] = {"episodic_returns": episodic_returns}
364
- logger.info(f"Environment {env_id}: {np.mean(episodic_returns)} ± {np.std(episodic_returns)}")
365
- return results
 
 
1
  import os
2
  from typing import Dict, SupportsFloat
3
 
 
302
  return thunk
303
 
304
 
305
+ def evaluate(repo_id, revision, env_id):
306
+ tags = API.model_info(repo_id, revision=revision).tags
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
 
308
  # Check if the agent exists
309
  try:
310
+ agent_path = API.hf_hub_download(repo_id=repo_id, filename="agent.pt")
311
  except EntryNotFoundError:
312
  logger.error("Agent not found")
313
  return None
314
 
315
  # Check safety
316
+ security = next(iter(API.get_paths_info(repo_id, "agent.pt", expand=True))).security
317
  if security is None or "safe" not in security:
318
  logger.warn("Agent safety not available")
319
  # return None
 
323
 
324
  # Load the agent
325
  try:
326
+ agent = torch.jit.load(agent_path)
327
  except Exception as e:
328
  logger.error(f"Error loading agent: {e}")
329
  return None
330
 
331
  # Evaluate the agent on the environments
332
+ envs = gym.vector.SyncVectorEnv([make(env_id) for _ in range(1)])
333
+ observations, _ = envs.reset()
334
+ episodic_returns = []
335
+ while len(episodic_returns) < NUM_EPISODES:
336
+ actions = agent(torch.tensor(observations)).numpy()
337
+ observations, _, _, _, infos = envs.step(actions)
338
+ if "final_info" in infos:
339
+ for info in infos["final_info"]:
340
+ if info is None or "episode" not in info:
341
+ continue
342
+ episodic_returns.append(float(info["episode"]["r"]))
343
+
344
+ return episodic_returns