dyang415 committed on
Commit
e0f91a5
·
verified ·
1 Parent(s): 4323452

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/test.jsonl filter=lfs diff=lfs merge=lfs -text
37
+ data/train.jsonl filter=lfs diff=lfs merge=lfs -text
data/images/1.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20813acc6638870c524e8b5246aae58d537e1a8f3548b7a53faa7993be014725
3
+ size 713968566
data/images/10.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f34cb4bcca9e00495d224329d3e37c06fa753b1ee0b59c742ab9de81843457dd
3
+ size 678272171
data/images/2.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22255d02908c1e757b8fd5039d868afbc03d189c074aa602117dfab2bb1ee3ea
3
+ size 720714560
data/images/3.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce90ca867bb0cc3f19954ad40ac9422ba590f44be0a5900239e510f089e80ac3
3
+ size 678512965
data/images/4.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5a0a2babe3226114406a93655b76f682b9d7c1ae1cff28bc5c13ec9097170e0
3
+ size 665420259
data/images/5.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9be0ce29f958d2dbe6238f5d5e8e4991297146205e368c00ae3b7239c1e419f5
3
+ size 724024231
data/images/6.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:475cf7ef8a8e24c0dd390052b2361fe0df358fe4f7dc2207af5bad9fa1c5e1de
3
+ size 754277919
data/images/7.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e5e70646488430221d06c665b52e11ff5d8f7a7f6588f819b9b74781ed5675f
3
+ size 682945336
data/images/8.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:921246d90f1bfc905de9adac17114a960e0c395ef56206ca2cc92f6faecbdefa
3
+ size 755881389
data/images/9.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c9656453a518dc8729e041fdd68a3524933a0f5929ff68e8aa359b5d03a5c83
3
+ size 731803010
data/test.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad04eb2fb3417f6127cefa77634040b1def227e1be68fd3b6760422aa488361a
3
+ size 13480332
data/train.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:267ae5ee1c91d27a18cebfd9d4ecafc80b0ca89c665bdf4ae339812bff3d3bfd
3
+ size 301314134
skyvern-gpt-4o.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import datasets
3
+ import os
4
+
5
+ logger = datasets.logging.get_logger(__name__)  # module-level logger named after this module; not referenced in the code shown
6
+
7
+
8
class Dataset(datasets.GeneratorBasedBuilder):
    """Builder that joins JSONL conversation records with extracted image files.

    Each JSONL record supplies ``images`` (relative image paths),
    ``conversations`` and ``length``.  The builder adds per-step and per-task
    fields (retry rank, number of retries, number of distinct steps, whether
    any step in the task contains a ``solve_captcha`` action) that are derived
    from the parent folder name of each record's first image.
    """

    def _info(self):
        """Return the ``DatasetInfo`` declaring the features of every example."""
        return datasets.DatasetInfo(
            features=datasets.Features({
                "images": datasets.Sequence(datasets.Image()),
                "length": datasets.Value(dtype="int32"),
                "conversations": datasets.Sequence(datasets.Features({
                    "from": datasets.Value("string"),
                    "value": datasets.Value("string")
                })),
                "task_name": datasets.Value("string"),
                "step_name": datasets.Value("string"),
                "has_retry": datasets.Value("bool"),
                "retry_index": datasets.Value("int32"),
                "total_retries": datasets.Value("int32"),
                "task_num_steps": datasets.Value("int32"),
                "task_has_solve_captcha": datasets.Value("bool"),
            })
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager):
        """Download the image archives and JSONL files and define the splits.

        Args:
            dl_manager: Download manager used to fetch and extract the ten
                image archives and the ``train.jsonl`` / ``test.jsonl`` files.

        Returns:
            A list with the TRAIN and TEST ``SplitGenerator`` objects.  Both
            receive the same mapping from ``images/<folder>/<file>`` keys to
            the extracted file's full local path.
        """
        dl_manager.download_config.token = True
        dl_manager.download_config.num_proc = 10

        base_url = "https://huggingface.co/datasets/empower-dev-staging/skyvern-v0/resolve/main/data"
        image_files = dl_manager.download_and_extract(
            [f"{base_url}/images/{i + 1}.tar.gz" for i in range(10)])

        image_file_to_full_path_mapping = {}
        for image_file in dl_manager.iter_files(image_files):
            # Normalise the local separator so the key is always built from
            # the last two path components; a no-op where os.sep is "/".
            parts = image_file.replace(os.sep, '/').split('/')
            key = 'images/' + '/'.join(parts[-2:])
            image_file_to_full_path_mapping[key] = image_file

        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "filepath": dl_manager.download_and_extract(
                        f"{base_url}/{file_name}"),
                    "image_file_to_full_path_mapping": image_file_to_full_path_mapping,
                },
            )
            for split_name, file_name in (
                (datasets.Split.TRAIN, "train.jsonl"),
                (datasets.Split.TEST, "test.jsonl"),
            )
        ]

    def _get_step_info(self, item):
        """Parse task, step and retry information from a record's first image.

        The parent folder of ``item['images'][0]`` is read as
        ``<task>-<step>_<retry>``: the text before the first ``-`` is the
        task, and the segment after it is split on ``_`` into the step number
        and an integer retry index.

        Args:
            item: A parsed JSONL record with a non-empty ``images`` list of
                ``/``-separated paths.

        Returns:
            A dict with ``task_name``, ``step_name`` (``"<task>-<step>"``) and
            ``retry_index`` (int).

        Raises:
            IndexError: If the path or folder name lacks the expected parts.
            ValueError: If the retry segment is not an integer.
        """
        folder = item['images'][0].split('/')[-2]
        name_parts = folder.split('-')

        task = name_parts[0]
        step = name_parts[1].split('_')

        step_number = step[0]
        retry_index = int(step[1])

        return {
            "task_name": task,
            "step_name": f"{task}-{step_number}",
            "retry_index": retry_index
        }

    def _generate_examples(self, filepath, image_file_to_full_path_mapping):
        """Yield ``(key, example)`` pairs for one split.

        The JSONL file is read in a first pass that collects the per-step and
        per-task aggregates; the examples are emitted afterwards because every
        example needs aggregates computed over the whole file.

        Records whose second conversation turn (parsed as JSON) has no
        ``actions`` are skipped, as are blank lines.

        Args:
            filepath: Local path of the JSONL file for this split.
            image_file_to_full_path_mapping: Maps each path listed in a
                record's ``images`` to the extracted file's full local path.

        Yields:
            ``(index, example)`` where ``index`` counts the kept records from
            zero and ``example`` matches the features declared in ``_info``.
        """
        records = []  # (item, step_info) pairs, in file order
        step_name_to_retry_indices = {}
        task_name_to_num_steps = {}
        task_name_to_having_solve_captcha = {}

        # Explicit UTF-8 keeps decoding independent of the platform locale;
        # iterating the handle avoids holding all raw lines in memory at once.
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue

                item = json.loads(line)
                actions = json.loads(item["conversations"][1]["value"])[
                    "actions"]
                if not actions:
                    continue

                step_info = self._get_step_info(item)
                records.append((item, step_info))

                step_name = step_info["step_name"]
                task_name = step_info["task_name"]

                # Sticky flag: once any step of the task has a captcha action,
                # the task keeps the flag.
                has_captcha = any(
                    action["action_type"].lower() == "solve_captcha"
                    for action in actions
                )
                task_name_to_having_solve_captcha[task_name] = (
                    task_name_to_having_solve_captcha.get(task_name, False)
                    or has_captcha
                )

                # A step is counted towards its task only the first time it
                # is seen; later records of the same step are retries.
                if step_name not in step_name_to_retry_indices:
                    step_name_to_retry_indices[step_name] = []
                    task_name_to_num_steps[task_name] = (
                        task_name_to_num_steps.get(task_name, 0) + 1
                    )
                step_name_to_retry_indices[step_name].append(
                    step_info["retry_index"])

        step_name_to_retry_indices = {
            step_name: sorted(retry_indices)
            for step_name, retry_indices in step_name_to_retry_indices.items()
        }

        for index, (item, step_info) in enumerate(records):
            task_name = step_info["task_name"]
            retry_indices = step_name_to_retry_indices[step_info["step_name"]]
            yield index, {
                "images": [
                    image_file_to_full_path_mapping[image]
                    for image in item["images"]
                ],
                "conversations": item["conversations"],
                "length": item["length"],
                "task_name": task_name,
                "step_name": step_info["step_name"],
                "has_retry": len(retry_indices) > 1,
                # Rank of this record's raw retry index among the step's
                # sorted retry indices, not the raw value itself.
                "retry_index": retry_indices.index(step_info["retry_index"]),
                "total_retries": len(retry_indices),
                "task_num_steps": task_name_to_num_steps[task_name],
                "task_has_solve_captcha": task_name_to_having_solve_captcha[task_name],
            }