File size: 2,112 Bytes
20cdc2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from packaging import version
import transformers
# Import-time guard: refuse to load this module when the installed
# transformers release is older than 4.31.0.
_installed_transformers = version.parse(transformers.__version__)
if _installed_transformers < version.parse("4.31.0"):
    raise ImportError(
        f"You are using transformers=={transformers.__version__}, but transformers>=4.31.0 is required to use DeciCoder. Please upgrade transformers."
    )
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.utils import logging


# Module-level logger obtained from the transformers logging utility.
logger = logging.get_logger(__name__)

# Empty archive map; it is not referenced anywhere in the visible part of this
# file. NOTE(review): presumably kept for parity with the upstream Llama
# configuration module -- confirm before removing.
LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}


class DeciCoderConfig(LlamaConfig):
    r"""
    Configuration class for DeciCoder.

    It subclasses [`LlamaConfig`] and additionally stores three boolean flags that select between naive matmul
    attention and scaled dot product attention for the different inference phases. Every other keyword argument is
    forwarded unchanged to [`LlamaConfig`]; read the documentation of [`LlamaConfig`] and [`PretrainedConfig`] for
    the remaining options.

    Args:
        naive_attention_prefill (`bool`, *optional*, defaults to `False`):
            Whether to use naive matmul or scaled dot product attention during prefill.
        naive_attention_decode_batched (`bool`, *optional*, defaults to `True`):
            Whether to use naive matmul or scaled dot product attention during decode for batch_size > 1.
        naive_attention_decode_single (`bool`, *optional*, defaults to `False`):
            Whether to use naive matmul or scaled dot product attention during decode for batch_size == 1.
        **kwargs:
            Passed through to `LlamaConfig.__init__`.
    """

    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        naive_attention_prefill: bool = False,
        naive_attention_decode_batched: bool = True,
        naive_attention_decode_single: bool = False,
        **kwargs,
    ):
        attention_flags = (
            ("naive_attention_prefill", naive_attention_prefill),
            ("naive_attention_decode_batched", naive_attention_decode_batched),
            ("naive_attention_decode_single", naive_attention_decode_single),
        )
        # Store the flags on the instance, in declaration order, before
        # delegating the remaining options to the parent configuration.
        for flag_name, flag_value in attention_flags:
            setattr(self, flag_name, flag_value)

        super().__init__(**kwargs)