michaelfeil committed · Commit 2ec3559 · Parent(s): 65217ea

Update README.md
README.md CHANGED
````diff
@@ -219,27 +219,11 @@ Speedup inference while reducing memory by 2x-4x using int8 inference in C++ on
 
 quantized version of [facebook/nllb-200-3.3B](https://huggingface.co/facebook/nllb-200-3.3B)
 ```bash
-pip install
+pip install ctranslate2>=3.16.0
 ```
 
 ```python
-# from transformers import AutoTokenizer
-model_name = "michaelfeil/ct2fast-nllb-200-3.3B"
 
-
-from hf_hub_ctranslate2 import TranslatorCT2fromHfHub
-model = TranslatorCT2fromHfHub(
-    # load in int8 on CUDA
-    model_name_or_path=model_name,
-    device="cuda",
-    compute_type="int8_float16",
-    # tokenizer=AutoTokenizer.from_pretrained("{ORG}/{NAME}")
-)
-outputs = model.generate(
-    text=["def fibonnaci(", "User: How are you doing? Bot:"],
-    max_length=64,
-)
-print(outputs)
 ```
 
 Checkpoint compatible to [ctranslate2>=3.16.0](https://github.com/OpenNMT/CTranslate2)
````
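This revision leaves the Python block empty, so the replacement snippet is not recoverable from the diff; the removed example also fed text-generation prompts (`"def fibonnaci("`, a chat turn) to what is a translation model, which is presumably why it was dropped. For reference, here is a minimal sketch of translating with the quantized checkpoint using plain CTranslate2, following the CTranslate2 documentation's NLLB recipe rather than anything in this README; the `snapshot_download` call, the language codes, and the example sentence are illustrative assumptions:

```python
# Hypothetical usage sketch, not this README's own example.
# pip install "ctranslate2>=3.16.0" transformers huggingface_hub
import ctranslate2
import transformers
from huggingface_hub import snapshot_download

# Fetch the converted CTranslate2 weights from the Hub (assumes the
# repo root contains the CTranslate2 model files).
model_dir = snapshot_download("michaelfeil/ct2fast-nllb-200-3.3B")

# Load in int8 on CUDA, matching the removed snippet's compute_type.
translator = ctranslate2.Translator(
    model_dir, device="cuda", compute_type="int8_float16"
)

# NLLB uses FLORES-200 language codes; eng_Latn -> fra_Latn is an
# arbitrary example pair.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "facebook/nllb-200-3.3B", src_lang="eng_Latn"
)

# Tokenize the source sentence and force the target language via a prefix.
source = tokenizer.convert_ids_to_tokens(tokenizer.encode("Hello world!"))
results = translator.translate_batch([source], target_prefix=[["fra_Latn"]])

# Drop the leading target-language token before decoding.
target_tokens = results[0].hypotheses[0][1:]
print(tokenizer.decode(tokenizer.convert_tokens_to_ids(target_tokens)))
```

The removed snippet's `TranslatorCT2fromHfHub` wrapper from `hf_hub_ctranslate2` folds the download, tokenization, and decoding steps into one object; the plain-`ctranslate2` form is shown here because its `translate_batch`/`target_prefix` interface is the one documented for NLLB.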