update readme

Files changed:
- README.md (+6, -21)
- assets/demo.wav (+3, -0)

README.md (CHANGED)
@@ -1013,11 +1013,12 @@ def get_video_chunk_content(video_path, flatten=True):
     return contents
 
 video_path="/path/to/video"
-sys_msg = model.get_sys_prompt(mode='omni', language='en')
 # if use voice clone prompt, please set ref_audio
-
-
-
+ref_audio_path = 'assets/demo.wav'
+ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
+sys_msg = model.get_sys_prompt(ref_audio=ref_audio, mode='omni', language='en')
+# or use default prompt
+# sys_msg = model.get_sys_prompt(mode='omni', language='en')
 
 contents = get_video_chunk_content(video_path)
 msg = {"role":"user", "content": contents}
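For context, here is how the new reference-audio lines slot into the omni example around them. A minimal sketch, assuming `model`, `tokenizer`, and `get_video_chunk_content` are already set up as earlier in the README; the `model.chat` parameters mirror the ones kept by the hunks below:

```python
import librosa

# Build a voice-clone system prompt from the newly added demo clip;
# the model's TTS output will imitate this reference voice.
ref_audio, _ = librosa.load('assets/demo.wav', sr=16000, mono=True)
sys_msg = model.get_sys_prompt(ref_audio=ref_audio, mode='omni', language='en')

# Interleave video frames and audio chunks into one user turn.
contents = get_video_chunk_content("/path/to/video")
msgs = [sys_msg, {"role": "user", "content": contents}]

res = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    max_new_tokens=128,
    use_tts_template=True,
    generate_audio=True,
    temperature=0.3,
)
```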
@@ -1122,7 +1123,7 @@ res = model.chat(
 <details> <summary>Click to view the Python code for enabling MiniCPM-o 2.6 to interact with you in a specified voice.</summary>
 
 ```python
-ref_audio, _ = librosa.load('
+ref_audio, _ = librosa.load('assets/demo.wav', sr=16000, mono=True) # load the reference audio
 
 # Audio RolePlay: # With this mode, model will role-play the character based on the audio prompt.
 sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_roleplay', language='en')
@@ -1135,14 +1136,10 @@ user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, m
 ```python
 msgs = [sys_prompt, user_question]
 res = model.chat(
-    image=None,
     msgs=msgs,
-    context=None,
     tokenizer=tokenizer,
     sampling=True,
     max_new_tokens=128,
-    stream=False,
-    stream_input=True,
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
@@ -1154,14 +1151,10 @@ history = msgs.append({'role': 'assistant', 'content': res})
 user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]}
 msgs = history.append(user_question)
 res = model.chat(
-    image=None,
     msgs=msgs,
-    context=None,
     tokenizer=tokenizer,
     sampling=True,
     max_new_tokens=128,
-    stream=False,
-    stream_input=True,
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
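The same four-argument cleanup (`image`, `context`, `stream`, `stream_input`) repeats in the two hunks that follow. One caveat in the context lines around this hunk: `history = msgs.append(...)` and `msgs = history.append(user_question)` assign the return value of `list.append`, which is always `None` in Python. A runnable multi-turn version (a sketch, not part of this commit) keeps one list and appends in place:

```python
# Turn 1: system prompt plus the first audio question.
msgs = [sys_prompt, user_question]
res = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    max_new_tokens=128,
    use_tts_template=True,
    generate_audio=True,
    temperature=0.3,
)

# Turn 2: list.append mutates in place and returns None, so append
# to the same list instead of reassigning it.
msgs.append({'role': 'assistant', 'content': res})
msgs.append({'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]})
res = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    max_new_tokens=128,
    use_tts_template=True,
    generate_audio=True,
    temperature=0.3,
)
```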
@@ -1193,14 +1186,10 @@ audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
 msgs = [{'role': 'user', 'content': [task_prompt,audio_input]}]
 
 res = model.chat(
-    image=None,
     msgs=msgs,
-    context=None,
     tokenizer=tokenizer,
     sampling=True,
     max_new_tokens=128,
-    stream=False,
-    stream_input=True,
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
@@ -1230,14 +1219,10 @@ msgs = [{'role': 'user', 'content': [task_prompt]}] # you can try to use the sam
 
 msgs = [sys_prompt, user_question]
 res = model.chat(
-    image=None,
     msgs=msgs,
-    context=None,
     tokenizer=tokenizer,
     sampling=True,
     max_new_tokens=128,
-    stream=False,
-    stream_input=True,
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
assets/demo.wav (ADDED)

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0b347d8ed0b2314c0d175fbdac79f6f3f91a6402bd7492ac5c860646a2ba309
+size 1454196
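The added file is a Git LFS pointer, so a plain clone without LFS support leaves this short text stub at `assets/demo.wav` rather than the 1,454,196-byte WAV the examples need; `git lfs pull` fetches the real file. A small sanity check before running the snippets above:

```python
import os

# If the file is still an LFS pointer (a short text stub) rather than
# the ~1.45 MB WAV recorded in the pointer's `size` field, the audio
# has not been fetched yet.
if os.path.getsize('assets/demo.wav') < 10_000:
    raise RuntimeError("assets/demo.wav looks like a Git LFS pointer; run `git lfs pull` first.")
```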