Spaces:
Sleeping
Sleeping
hide params on reclick
Browse files
app.py
CHANGED
@@ -78,52 +78,9 @@ DEFAULT_CODE = dedent(
|
|
78 |
|
79 |
make_gallery_image_buttons_js = """
|
80 |
function load() {
|
81 |
-
class ClassWatcher {
|
82 |
-
|
83 |
-
constructor(targetNode, classToWatch, classAddedCallback, arg) {
|
84 |
-
this.targetNode = targetNode
|
85 |
-
this.classToWatch = classToWatch
|
86 |
-
this.classAddedCallback = classAddedCallback
|
87 |
-
this.arg = arg
|
88 |
-
this.observer = null
|
89 |
-
this.lastClassState = targetNode.classList.contains(this.classToWatch)
|
90 |
-
|
91 |
-
this.init()
|
92 |
-
}
|
93 |
-
|
94 |
-
init() {
|
95 |
-
this.observer = new MutationObserver(this.mutationCallback)
|
96 |
-
this.observe()
|
97 |
-
}
|
98 |
-
|
99 |
-
observe() {
|
100 |
-
this.observer.observe(this.targetNode, { attributes: true })
|
101 |
-
}
|
102 |
-
|
103 |
-
disconnect() {
|
104 |
-
this.observer.disconnect()
|
105 |
-
}
|
106 |
-
|
107 |
-
mutationCallback = mutationsList => {
|
108 |
-
for (let mutation of mutationsList) {
|
109 |
-
if (mutation.type === 'attributes' && mutation.attributeName === 'class') {
|
110 |
-
let currentClassState = mutation.target.classList.contains(this.classToWatch)
|
111 |
-
if(this.lastClassState !== currentClassState) {
|
112 |
-
this.lastClassState = currentClassState
|
113 |
-
if(currentClassState) {
|
114 |
-
this.classAddedCallback(this.arg)
|
115 |
-
}
|
116 |
-
}
|
117 |
-
}
|
118 |
-
}
|
119 |
-
}
|
120 |
-
}
|
121 |
let buttons = document.getElementsByClassName("block-button");
|
122 |
-
function clickButton(i) {
|
123 |
-
buttons[i].click();
|
124 |
-
}
|
125 |
Array.from(document.getElementById("pipeline-gallery").getElementsByClassName("thumbnail-item")).map(
|
126 |
-
(b, i) =>
|
127 |
)
|
128 |
}
|
129 |
"""
|
@@ -147,7 +104,7 @@ tr td {
|
|
147 |
min-height: 600px;
|
148 |
max-height: 600px;
|
149 |
}
|
150 |
-
.
|
151 |
overflow: scroll;
|
152 |
}
|
153 |
"""
|
@@ -164,10 +121,10 @@ def non_empty_list_or_none(input_list: list[str]) -> Optional[list[str]]:
|
|
164 |
|
165 |
|
166 |
with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
|
167 |
-
state = gr.State({"selected_block":
|
168 |
gr.Markdown("# Common Crawl Pipeline Creator")
|
169 |
with gr.Row():
|
170 |
-
with gr.Column():
|
171 |
gallery = gr.Gallery(
|
172 |
blocks,
|
173 |
columns=4,
|
@@ -344,28 +301,31 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
|
|
344 |
]
|
345 |
|
346 |
with gr.Column():
|
347 |
-
with gr.
|
348 |
-
|
349 |
-
|
350 |
-
with gr.
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
|
|
360 |
|
361 |
|
362 |
gr.Markdown("_powered by [datatrove](https://github.com/huggingface/datatrove)_")
|
363 |
|
364 |
-
def show_block_ui(i):
|
|
|
|
|
365 |
return {**{block_ui: gr.Column(visible=(j == i)) for j, block_ui in enumerate(blocks_uis)}, state: {"selected_block": i}}
|
366 |
|
367 |
for i, button in enumerate(gallery_image_buttons):
|
368 |
-
button.click(partial(show_block_ui, i), outputs=blocks_uis + [state])
|
369 |
|
370 |
|
371 |
inputs = [
|
@@ -505,8 +465,8 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
|
|
505 |
|
506 |
if num_warc_samples:
|
507 |
yield {
|
508 |
-
output_tab: gr.Tab(f"Output (
|
509 |
-
excluded_tab: gr.Tab(f"Excluded (
|
510 |
output_dataframe: pd.DataFrame({"text": [doc.text for doc in output_docs]}),
|
511 |
**{
|
512 |
excluded_dataframes[type(step_to_run)]: pd.DataFrame({"text": [doc.text for doc in step_to_run.exclusion_writer.docs]})
|
@@ -514,7 +474,7 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
|
|
514 |
if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
|
515 |
},
|
516 |
**{
|
517 |
-
excluded_tabs[type(step_to_run)]: gr.Tab(f"{type(step_to_run).__name__} (
|
518 |
for step_to_run in pipeline_executor.pipeline
|
519 |
if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
|
520 |
},
|
@@ -535,8 +495,8 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
|
|
535 |
},
|
536 |
}
|
537 |
yield {
|
538 |
-
output_tab: gr.Tab(f"Output (
|
539 |
-
excluded_tab: gr.Tab(f"Excluded (
|
540 |
output_dataframe: pd.DataFrame({"text": [doc.text for doc in output_docs]}),
|
541 |
**{
|
542 |
excluded_dataframes[type(step_to_run)]: pd.DataFrame({"text": [doc.text for doc in step_to_run.exclusion_writer.docs]})
|
@@ -544,7 +504,7 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
|
|
544 |
if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
|
545 |
},
|
546 |
**{
|
547 |
-
excluded_tabs[type(step_to_run)]: gr.Tab(f"{type(step_to_run).__name__} (
|
548 |
for step_to_run in pipeline_executor.pipeline
|
549 |
if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
|
550 |
},
|
|
|
78 |
|
79 |
make_gallery_image_buttons_js = """
|
80 |
function load() {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
let buttons = document.getElementsByClassName("block-button");
|
|
|
|
|
|
|
82 |
Array.from(document.getElementById("pipeline-gallery").getElementsByClassName("thumbnail-item")).map(
|
83 |
+
(b, i) => b.addEventListener("click", () => buttons[i].click())
|
84 |
)
|
85 |
}
|
86 |
"""
|
|
|
104 |
min-height: 600px;
|
105 |
max-height: 600px;
|
106 |
}
|
107 |
+
.scollabe_tabs .tab-wrapper .tab-container {
|
108 |
overflow: scroll;
|
109 |
}
|
110 |
"""
|
|
|
121 |
|
122 |
|
123 |
with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
|
124 |
+
state = gr.State({"selected_block": None})
|
125 |
gr.Markdown("# Common Crawl Pipeline Creator")
|
126 |
with gr.Row():
|
127 |
+
with gr.Column(min_width=640):
|
128 |
gallery = gr.Gallery(
|
129 |
blocks,
|
130 |
columns=4,
|
|
|
301 |
]
|
302 |
|
303 |
with gr.Column():
|
304 |
+
with gr.Tabs(elem_classes="scollabe_tabs"):
|
305 |
+
with gr.Tab("Output (and % of data)") as output_tab:
|
306 |
+
output_dataframe = gr.DataFrame(datatype="markdown")
|
307 |
+
with gr.Tab("Excluded (and % of data)") as excluded_tab:
|
308 |
+
with gr.Tabs(elem_classes="scollabe_tabs"):
|
309 |
+
excluded_dataframes: dict[Type, gr.DataFrame] = {}
|
310 |
+
excluded_tabs: dict[Type, gr.Tab] = {}
|
311 |
+
for step in steps:
|
312 |
+
if issubclass(step, BaseFilter) and step is not URLFilter:
|
313 |
+
with gr.Tab(step.__name__ + " (and % of data)") as t:
|
314 |
+
excluded_dataframes[step] = gr.DataFrame(datatype="markdown")
|
315 |
+
excluded_tabs[step] = t
|
316 |
+
with gr.Tab("Python code") as code_tab:
|
317 |
+
python_code_markdown = gr.Markdown(DEFAULT_CODE)
|
318 |
|
319 |
|
320 |
gr.Markdown("_powered by [datatrove](https://github.com/huggingface/datatrove)_")
|
321 |
|
322 |
+
def show_block_ui(i, current_state: dict):
|
323 |
+
if i == current_state.get("selected_block"):
|
324 |
+
i = None
|
325 |
return {**{block_ui: gr.Column(visible=(j == i)) for j, block_ui in enumerate(blocks_uis)}, state: {"selected_block": i}}
|
326 |
|
327 |
for i, button in enumerate(gallery_image_buttons):
|
328 |
+
button.click(partial(show_block_ui, i), inputs=[state], outputs=blocks_uis + [state])
|
329 |
|
330 |
|
331 |
inputs = [
|
|
|
465 |
|
466 |
if num_warc_samples:
|
467 |
yield {
|
468 |
+
output_tab: gr.Tab(f"Output ({len(output_docs)/num_warc_samples*100:.03f}%)"),
|
469 |
+
excluded_tab: gr.Tab(f"Excluded ({100 - len(output_docs)/num_warc_samples*100:.03f}%)"),
|
470 |
output_dataframe: pd.DataFrame({"text": [doc.text for doc in output_docs]}),
|
471 |
**{
|
472 |
excluded_dataframes[type(step_to_run)]: pd.DataFrame({"text": [doc.text for doc in step_to_run.exclusion_writer.docs]})
|
|
|
474 |
if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
|
475 |
},
|
476 |
**{
|
477 |
+
excluded_tabs[type(step_to_run)]: gr.Tab(f"{type(step_to_run).__name__} ({len(step_to_run.exclusion_writer.docs)/num_warc_samples*100:.03f}%)")
|
478 |
for step_to_run in pipeline_executor.pipeline
|
479 |
if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
|
480 |
},
|
|
|
495 |
},
|
496 |
}
|
497 |
yield {
|
498 |
+
output_tab: gr.Tab(f"Output ({len(output_docs)/num_warc_samples*100:.03f}%)"),
|
499 |
+
excluded_tab: gr.Tab(f"Excluded ({100 - len(output_docs)/num_warc_samples*100:.03f}%)"),
|
500 |
output_dataframe: pd.DataFrame({"text": [doc.text for doc in output_docs]}),
|
501 |
**{
|
502 |
excluded_dataframes[type(step_to_run)]: pd.DataFrame({"text": [doc.text for doc in step_to_run.exclusion_writer.docs]})
|
|
|
504 |
if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
|
505 |
},
|
506 |
**{
|
507 |
+
excluded_tabs[type(step_to_run)]: gr.Tab(f"{type(step_to_run).__name__} ({len(step_to_run.exclusion_writer.docs)/num_warc_samples*100:.03f}%)")
|
508 |
for step_to_run in pipeline_executor.pipeline
|
509 |
if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
|
510 |
},
|