import gradio as gr
import numpy as np
import torch
from transformers import pipeline, Pipeline, AutoConfig, AutoModel, AutoModelForMaskedLM
from transformers.pipelines import PIPELINE_REGISTRY, FillMaskPipeline

unmasker = pipeline("fill-mask", model="anferico/bert-for-patents")
# unmasker = pipeline("temp-scale", model="anferico/bert-for-patents")

example = 'A crustless [MASK] made from two slices of baked bread'
example_dict = {}
example_dict['input_ids'] = example

def add_mask(text, size=1):
    """Replace `size` randomly chosen words in `text` with the [MASK] token."""
    split_text = text.split()
    idx = np.random.randint(len(split_text), size=size)
    for i in idx:
        split_text[i] = '[MASK]'
    return ' '.join(split_text)
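
# Illustrative only -- the masked position is chosen at random, e.g.:
#   add_mask('two slices of baked bread')  ->  'two slices of [MASK] bread'
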
class TempScalePipe(FillMaskPipeline):
    """A fill-mask pipeline that temperature-scales the logits and samples
    candidate tokens from the flattened distribution instead of taking the
    top-k of the raw softmax."""

    temperature = 10.0

    def postprocess(self, model_outputs, top_k=3, target_ids=None):
        # Cap top_k if there are targets
        if target_ids is not None and target_ids.shape[0] < top_k:
            top_k = target_ids.shape[0]
        input_ids = model_outputs["input_ids"][0]
        outputs = model_outputs["logits"]
        masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False).squeeze(-1)
        # Temperature-scale the logits (T = 10) to flatten the distribution
        logits = outputs[0, masked_index, :] / self.temperature
        probs = logits.softmax(dim=-1)
        if target_ids is not None:
            probs = probs[..., target_ids]
        # Sample top_k candidate tokens from the scaled distribution
        # (instead of `probs.topk(top_k)` as in the stock FillMaskPipeline),
        # then sort the samples by probability so the output stays ordered.
        predictions = torch.multinomial(probs, num_samples=top_k)
        values = torch.gather(probs, 1, predictions)
        values, sort_idx = values.sort(dim=-1, descending=True)
        predictions = torch.gather(predictions, 1, sort_idx)
        result = []
        single_mask = values.shape[0] == 1
        for i, (_values, _predictions) in enumerate(zip(values.tolist(), predictions.tolist())):
            row = []
            for v, p in zip(_values, _predictions):
                # Copy is important since we're going to modify this array in place
                tokens = input_ids.numpy().copy()
                if target_ids is not None:
                    # Map the index within the target set back to a vocabulary id
                    p = target_ids[p].tolist()
                tokens[masked_index[i]] = p
                # Filter padding out:
                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
                # Originally we skip special tokens to give readable output.
                # For multi masks though, the other [MASK] would be removed otherwise
                # making the output look odd, so we add them back
                sequence = self.tokenizer.decode(tokens, skip_special_tokens=single_mask)
                proposition = {
                    "score": v,
                    "token": p,
                    "token_str": self.tokenizer.decode([p]),
                    "sequence": sequence,
                }
                row.append(proposition)
            result.append(row)
        if single_mask:
            return result[0]
        return result

PIPELINE_REGISTRY.register_pipeline(
    "temp-scale",
    pipeline_class=TempScalePipe,
    pt_model=AutoModelForMaskedLM,
)
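
# Sketch: with the task registered, the sampling pipeline can be constructed
# like any built-in one (this is what the commented-out line near the top of
# the file does; note that the registration above has to run before that call
# for the "temp-scale" task name to resolve):
#   unmasker = pipeline("temp-scale", model="anferico/bert-for-patents")
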
def unmask(text):
    # text = add_mask(text)
    res = unmasker(text)
    # gr.Label expects a {label: confidence} mapping
    out = {item["token_str"]: item["score"] for item in res}
    return out

textbox = gr.Textbox(label="Type language here", lines=5)

demo = gr.Interface(
    fn=unmask,
    inputs=textbox,
    outputs="label",
    examples=[example],
)

demo.launch()