8bit-coder committed on
Commit
51d42b1
·
1 Parent(s): 6839937

Upload training_files

Browse files
training_files/alpaca-megaset-fixed.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd16fa0cb1e2402ab5839ec2231ceacf8062070cd750b50b879e74cb16603d3e
3
+ size 30418704
training_files/convert-hf-to-pth-16b.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Convert hf to pth
2
+ import os
3
+ import json
4
+
5
+ import torch
6
+ from transformers import LlamaTokenizer, LlamaForCausalLM
7
+
8
# Load the tokenizer from the base Hugging Face checkpoint directory.
# NOTE(review): `tokenizer` is not referenced again in this script.
tokenizer = LlamaTokenizer.from_pretrained("./llama-7b-hf")

# Load the fine-tuned weights on CPU in half precision (no 8-bit quantization).
_load_options = {
    "load_in_8bit": False,
    "torch_dtype": torch.float16,
    "device_map": {"": "cpu"},
}
base_model = LlamaForCausalLM.from_pretrained("output_7b", **_load_options)

# Flat name -> tensor mapping that the conversion loop below walks over.
base_model_sd = base_model.state_dict()

# Model hyper-parameters; this dict is dumped verbatim to params.json later.
params = dict(
    dim=4096,
    multiple_of=256,
    n_heads=32,
    n_layers=32,
    norm_eps=1e-06,
    vocab_size=-1,
)

# Convenience aliases for the values the helpers below read.
n_layers, n_heads, dim = params["n_layers"], params["n_heads"], params["dim"]
dims_per_head = dim // n_heads

# Inverse-frequency table built from even indices in [0, dims_per_head).
# NOTE(review): `inv_freq` is computed but not used elsewhere in this script.
base = 10000.0
inv_freq = 1.0 / (
    base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)
)
33
+
34
+
35
def permute(w, heads=None):
    """Reorder the rows of a 2-D weight matrix within each attention head.

    Each head's block of rows is viewed as ``(rows_per_head // 2, 2)``, the
    two axes are swapped, and the result is flattened back to the original
    shape. This is the exact inverse of ``unpermute``.

    Args:
        w: 2-D tensor of shape ``(rows, cols)``. ``rows`` must be divisible
            by ``2 * heads``. The shape is taken from the tensor itself, so
            non-square matrices are accepted.
        heads: Number of attention heads. Defaults to the module-level
            ``n_heads`` when omitted, which preserves the original behaviour.

    Returns:
        A tensor with the same shape as ``w`` and its rows reordered.
    """
    if heads is None:
        # Looked up at call time so the script-level configuration applies.
        heads = n_heads
    rows, cols = w.shape
    return (
        w.view(heads, rows // heads // 2, 2, cols)
        .transpose(1, 2)
        .reshape(rows, cols)
    )
39
+
40
+
41
def unpermute(w, heads=None):
    """Undo the per-head row reordering applied by ``permute``.

    Each head's block of rows is viewed as ``(2, rows_per_head // 2)``, the
    two axes are swapped, and the result is flattened back to the original
    shape.

    Args:
        w: 2-D tensor of shape ``(rows, cols)``. ``rows`` must be divisible
            by ``2 * heads``. The shape is taken from the tensor itself, so
            non-square matrices are accepted.
        heads: Number of attention heads. Defaults to the module-level
            ``n_heads`` when omitted, which preserves the original behaviour.

    Returns:
        A tensor with the same shape as ``w`` and its rows reordered.
    """
    if heads is None:
        # Looked up at call time so the script-level configuration applies.
        heads = n_heads
    rows, cols = w.shape
    return (
        w.view(heads, 2, rows // heads // 2, cols)
        .transpose(1, 2)
        .reshape(rows, cols)
    )
45
+
46
+
47
# Parameter names that live outside the per-layer stack.
_TOP_LEVEL_KEY_MAP = {
    "model.embed_tokens.weight": "tok_embeddings.weight",
    "model.norm.weight": "norm.weight",
    "lm_head.weight": "output.weight",
}

# Per-layer key suffix -> name used in the output checkpoint (without the
# "layers.<n>." prefix, which is added by translate_state_dict_key).
_LAYER_SUFFIX_MAP = {
    ".self_attn.q_proj.weight": "attention.wq.weight",
    ".self_attn.k_proj.weight": "attention.wk.weight",
    ".self_attn.v_proj.weight": "attention.wv.weight",
    ".self_attn.o_proj.weight": "attention.wo.weight",
    ".mlp.gate_proj.weight": "feed_forward.w1.weight",
    ".mlp.down_proj.weight": "feed_forward.w2.weight",
    ".mlp.up_proj.weight": "feed_forward.w3.weight",
    ".input_layernorm.weight": "attention_norm.weight",
    ".post_attention_layernorm.weight": "ffn_norm.weight",
}


def translate_state_dict_key(k):
    """Map a source state-dict key to its name in the output checkpoint.

    Any ``"base_model.model."`` fragment is removed from the key first.

    Args:
        k: Key from the loaded model's ``state_dict()``.

    Returns:
        The translated key, or ``None`` for entries that must not be copied
        (rotary ``inv_freq`` buffers and any per-layer key containing
        ``"lora"``).

    Raises:
        NotImplementedError: If the key matches no known pattern. The
            offending key is included in the exception message.
    """
    k = k.replace("base_model.model.", "")

    if k in _TOP_LEVEL_KEY_MAP:
        return _TOP_LEVEL_KEY_MAP[k]

    if not k.startswith("model.layers."):
        raise NotImplementedError(f"unrecognized state-dict key: {k!r}")

    # Keys look like "model.layers.<n>.<...>"; the third field is the index.
    layer = k.split(".")[2]
    for suffix, target in _LAYER_SUFFIX_MAP.items():
        if k.endswith(suffix):
            return f"layers.{layer}.{target}"

    # Checked after the suffix table, matching the original branch order.
    if k.endswith("rotary_emb.inv_freq") or "lora" in k:
        return None

    raise NotImplementedError(
        f"unrecognized key in layer {layer}: {k!r}"
    )
83
+
84
+
85
# Re-key every tensor using translate_state_dict_key; entries that translate
# to None are dropped. Query/key projection weights are additionally passed
# through unpermute().
new_state_dict = {}
for k, v in base_model_sd.items():
    new_k = translate_state_dict_key(k)
    if new_k is None:
        continue
    if "wq" in new_k or "wk" in new_k:
        v = unpermute(v)
    new_state_dict[new_k] = v

torch.save(new_state_dict, "consolidated.00.pth")

with open("params.json", "w") as f:
    json.dump(params, f)

# Resize tensors
# Keep only the first `row_exclude` rows of the embedding and output matrices.
# NOTE(review): 32000 is presumably the base vocabulary size, with the extra
# rows coming from tokens added during fine-tuning - confirm.
row_exclude = 32000

# Work on the in-memory dict instead of re-reading consolidated.00.pth from
# disk: the tensors are identical and the reload only cost time and memory.
model = dict(new_state_dict)
for name in ("tok_embeddings.weight", "output.weight"):
    # clone() matters: a bare slice is a view, and torch.save serializes a
    # view together with its whole underlying storage, so the dropped rows
    # would still be written to the file.
    model[name] = model[name][:row_exclude].clone()
torch.save(model, "consolidated.01.pth")
# Delete consolidated.00.pth and rename consolidated.01.pth into consolidated.00.pth
training_files/convert-hf-to-pth-32b.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Convert hf to pth
2
+ import os
3
+ import json
4
+
5
+ import torch
6
+ from transformers import LlamaTokenizer, LlamaForCausalLM
7
+
8
# Load the tokenizer from the base Hugging Face checkpoint directory.
# NOTE(review): `tokenizer` is not referenced again in this script.
tokenizer = LlamaTokenizer.from_pretrained("./llama-7b-hf")

# Load the fine-tuned weights on CPU in half precision (no 8-bit quantization).
_load_options = {
    "load_in_8bit": False,
    "torch_dtype": torch.float16,
    "device_map": {"": "cpu"},
}
base_model = LlamaForCausalLM.from_pretrained("output_7b", **_load_options)

# Flat name -> tensor mapping that the conversion loop below walks over.
base_model_sd = base_model.state_dict()

# Model hyper-parameters; this dict is dumped verbatim to params.json later.
params = dict(
    dim=4096,
    multiple_of=256,
    n_heads=32,
    n_layers=32,
    norm_eps=1e-06,
    vocab_size=-1,
)

# Convenience aliases for the values the helpers below read.
n_layers, n_heads, dim = params["n_layers"], params["n_heads"], params["dim"]
dims_per_head = dim // n_heads

# Inverse-frequency table built from even indices in [0, dims_per_head).
# NOTE(review): `inv_freq` is computed but not used elsewhere in this script.
base = 10000.0
inv_freq = 1.0 / (
    base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)
)
33
+
34
+
35
def permute(w, heads=None):
    """Reorder the rows of a 2-D weight matrix within each attention head.

    Each head's block of rows is viewed as ``(rows_per_head // 2, 2)``, the
    two axes are swapped, and the result is flattened back to the original
    shape. This is the exact inverse of ``unpermute``.

    Args:
        w: 2-D tensor of shape ``(rows, cols)``. ``rows`` must be divisible
            by ``2 * heads``. The shape is taken from the tensor itself, so
            non-square matrices are accepted.
        heads: Number of attention heads. Defaults to the module-level
            ``n_heads`` when omitted, which preserves the original behaviour.

    Returns:
        A tensor with the same shape as ``w`` and its rows reordered.
    """
    if heads is None:
        # Looked up at call time so the script-level configuration applies.
        heads = n_heads
    rows, cols = w.shape
    return (
        w.view(heads, rows // heads // 2, 2, cols)
        .transpose(1, 2)
        .reshape(rows, cols)
    )
39
+
40
+
41
def unpermute(w, heads=None):
    """Undo the per-head row reordering applied by ``permute``.

    Each head's block of rows is viewed as ``(2, rows_per_head // 2)``, the
    two axes are swapped, and the result is flattened back to the original
    shape.

    Args:
        w: 2-D tensor of shape ``(rows, cols)``. ``rows`` must be divisible
            by ``2 * heads``. The shape is taken from the tensor itself, so
            non-square matrices are accepted.
        heads: Number of attention heads. Defaults to the module-level
            ``n_heads`` when omitted, which preserves the original behaviour.

    Returns:
        A tensor with the same shape as ``w`` and its rows reordered.
    """
    if heads is None:
        # Looked up at call time so the script-level configuration applies.
        heads = n_heads
    rows, cols = w.shape
    return (
        w.view(heads, 2, rows // heads // 2, cols)
        .transpose(1, 2)
        .reshape(rows, cols)
    )
45
+
46
+
47
# Parameter names that live outside the per-layer stack.
_TOP_LEVEL_KEY_MAP = {
    "model.embed_tokens.weight": "tok_embeddings.weight",
    "model.norm.weight": "norm.weight",
    "lm_head.weight": "output.weight",
}

# Per-layer key suffix -> name used in the output checkpoint (without the
# "layers.<n>." prefix, which is added by translate_state_dict_key).
_LAYER_SUFFIX_MAP = {
    ".self_attn.q_proj.weight": "attention.wq.weight",
    ".self_attn.k_proj.weight": "attention.wk.weight",
    ".self_attn.v_proj.weight": "attention.wv.weight",
    ".self_attn.o_proj.weight": "attention.wo.weight",
    ".mlp.gate_proj.weight": "feed_forward.w1.weight",
    ".mlp.down_proj.weight": "feed_forward.w2.weight",
    ".mlp.up_proj.weight": "feed_forward.w3.weight",
    ".input_layernorm.weight": "attention_norm.weight",
    ".post_attention_layernorm.weight": "ffn_norm.weight",
}


def translate_state_dict_key(k):
    """Map a source state-dict key to its name in the output checkpoint.

    Any ``"base_model.model."`` fragment is removed from the key first.

    Args:
        k: Key from the loaded model's ``state_dict()``.

    Returns:
        The translated key, or ``None`` for entries that must not be copied
        (rotary ``inv_freq`` buffers and any per-layer key containing
        ``"lora"``).

    Raises:
        NotImplementedError: If the key matches no known pattern. The
            offending key is included in the exception message.
    """
    k = k.replace("base_model.model.", "")

    if k in _TOP_LEVEL_KEY_MAP:
        return _TOP_LEVEL_KEY_MAP[k]

    if not k.startswith("model.layers."):
        raise NotImplementedError(f"unrecognized state-dict key: {k!r}")

    # Keys look like "model.layers.<n>.<...>"; the third field is the index.
    layer = k.split(".")[2]
    for suffix, target in _LAYER_SUFFIX_MAP.items():
        if k.endswith(suffix):
            return f"layers.{layer}.{target}"

    # Checked after the suffix table, matching the original branch order.
    if k.endswith("rotary_emb.inv_freq") or "lora" in k:
        return None

    raise NotImplementedError(
        f"unrecognized key in layer {layer}: {k!r}"
    )
83
+
84
+
85
# Build the output checkpoint: translate each key, drop entries whose
# translation is None, and run query/key projection weights through
# unpermute() before storing them.
new_state_dict = {}
for k, v in base_model_sd.items():
    new_k = translate_state_dict_key(k)
    if new_k is None:
        continue
    is_query_or_key = "wq" in new_k or "wk" in new_k
    new_state_dict[new_k] = unpermute(v) if is_query_or_key else v

# Write the converted weights next to the script.
torch.save(new_state_dict, "consolidated.00.pth")

# Write the hyper-parameter dict alongside the weights.
with open("params.json", "w") as f:
    json.dump(params, f)
training_files/dataset_validator.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

# Keys every training record is required to carry.
REQUIRED_KEYS = ("instruction", "input", "output")


def find_invalid_items(data):
    """Return a report entry for every malformed record in *data*.

    A record is malformed when it is not a dict or when it lacks one of
    ``REQUIRED_KEYS``. Non-dict records are reported as missing every key;
    a plain ``in`` test on a string would be a substring check and could
    wrongly accept it.

    Args:
        data: Iterable of decoded JSON records.

    Returns:
        A list of ``(index, item, missing_keys)`` tuples in input order;
        empty when every record is valid.
    """
    problems = []
    for index, item in enumerate(data):
        if isinstance(item, dict):
            missing = [key for key in REQUIRED_KEYS if key not in item]
        else:
            missing = list(REQUIRED_KEYS)
        if missing:
            problems.append((index, item, missing))
    return problems


def main():
    """Prompt for a JSON file name and print every invalid training record."""
    print("This program will validate the JSON training data.")

    path = input("Enter the file name with extension: ")

    # Load the JSON file
    with open(path, "r", encoding="utf8") as f:
        data = json.load(f)

    if not isinstance(data, list):
        # Iterating a dict or scalar would check the wrong things (or crash).
        print("Error: Top-level JSON value must be a list of items.")
    else:
        # Check each item in the JSON file
        for index, item, missing in find_invalid_items(data):
            print("Error: Missing key in JSON item.")
            print(f"Item {index} is missing: {', '.join(missing)}")
            print(item)

    print("File done. ")


if __name__ == "__main__":
    main()
training_files/full-training-instructions.txt ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Linux-x86_64.sh
2
+
3
+ bash Miniconda3-py310_23.1.0-1-Linux-x86_64.sh
4
+
5
+ enter, enter, yes, defaults
6
+
7
+ sudo reboot
8
+
9
+ conda activate
10
+ conda create -n alpaca python=3.10
11
+ conda activate alpaca
12
+
13
+ export PATH="/home/ubuntu/miniconda3/envs/alpaca/bin:$PATH"
14
+
15
+ sudo apt-get install git-lfs
16
+ git lfs install
17
+
18
+ git clone https://github.com/tatsu-lab/stanford_alpaca
19
+
20
+ git clone https://huggingface.co/decapoda-research/llama-7b-hf
21
+ #remember to edit the tokenizer_config.json from LLaMATokenizer to LlamaTokenizer
22
+
23
+ git clone https://huggingface.co/8bit-coder/alpaca-7b-nativeEnhanced
24
+
25
+ pip install sentencepiece
26
+ pip install git+https://github.com/huggingface/transformers.git
27
+
28
+ cd ./stanford_alpaca
29
+
30
+ pip install -r requirements.txt
31
+
32
+ cd ..
33
+
34
+ torchrun --nproc_per_node=8 --master_port=3045 ./stanford_alpaca/train.py --model_name_or_path ./llama-7b-hf --data_path ./alpaca-7b-nativeEnhanced/training_files/alpaca-megaset-fixed.json --fp16 True --output_dir ./output_7b --num_train_epochs 3 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --gradient_accumulation_steps 16 --evaluation_strategy "no" --save_strategy "steps" --save_steps 200 --learning_rate 2e-5 --weight_decay 0. --warmup_ratio 0.03 --lr_scheduler_type "cosine" --logging_steps 1 --fsdp "full_shard auto_wrap" --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' --tf32 True
35
+
36
+ # now, make sure with nano that script1.py has proper paths to everything
37
+
38
+ pip install -q datasets loralib sentencepiece
39
+ pip install bitsandbytes
40
+
41
+ python script1.py
42
+
43
+ git clone https://github.com/antimatter15/alpaca.cpp
44
+
45
+ cd alpaca.cpp
46
+ mkdir models
47
+ cd ..
48
+
49
+ mv consolidated.01.pth ./alpaca.cpp/models/consolidated.00.pth
50
+ mv params.json ./alpaca.cpp/models/params.json
51
+ mv output_7b/tokenizer.model ./alpaca.cpp/models/tokenizer.model
52
+
53
+ cd alpaca.cpp
54
+
55
+ make
56
+
57
+ cd ..
58
+
59
+ python .deez/convert-pth-to-ggml.py ./alpaca.cpp/models 2  # last argument: 1 for 7b, 2 for 13b; for other sizes, check the script yourself ;)
60
+
61
+ cd alpaca.cpp
62
+
63
+ ./quantize models/ggml-model-f16.bin ggml-alpaca-13b-nativeEnhanced-q4.bin 2
64
+
65
+ there's your finished model!