jbilcke-hf HF staff committed on
Commit
7249a2e
·
1 Parent(s): 6caeb80

add support for whisper-turbo (base)

Browse files
package-lock.json CHANGED
The diff for this file is too large to render. See raw diff
 
package.json CHANGED
@@ -24,6 +24,7 @@
24
  "@radix-ui/react-separator": "^1.0.3",
25
  "@radix-ui/react-slot": "^1.0.2",
26
  "@radix-ui/react-switch": "^1.0.3",
 
27
  "@radix-ui/react-tooltip": "^1.0.6",
28
  "@react-pdf/renderer": "^3.1.12",
29
  "@types/node": "20.4.2",
@@ -59,7 +60,9 @@
59
  "tts-react": "^3.0.1",
60
  "typescript": "5.1.6",
61
  "usehooks-ts": "^2.9.1",
62
- "uuid": "^9.0.0"
 
 
63
  },
64
  "devDependencies": {
65
  "@types/sbd": "^1.0.3"
 
24
  "@radix-ui/react-separator": "^1.0.3",
25
  "@radix-ui/react-slot": "^1.0.2",
26
  "@radix-ui/react-switch": "^1.0.3",
27
+ "@radix-ui/react-toast": "^1.1.4",
28
  "@radix-ui/react-tooltip": "^1.0.6",
29
  "@react-pdf/renderer": "^3.1.12",
30
  "@types/node": "20.4.2",
 
60
  "tts-react": "^3.0.1",
61
  "typescript": "5.1.6",
62
  "usehooks-ts": "^2.9.1",
63
+ "uuid": "^9.0.0",
64
+ "webm-to-wav-converter": "^1.1.0",
65
+ "whisper-turbo": "^0.7.0"
66
  },
67
  "devDependencies": {
68
  "@types/sbd": "^1.0.3"
src/app/engine/listen.ts ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use server"
2
+
3
+ import { SoundAnalysisRequest, SoundAnalysisResponse } from "@/types"
4
+
5
+ const apiUrl = `${process.env.RENDERING_ENGINE_API || ""}`
6
+
7
+ export async function listen(sound: string): Promise<string> {
8
+ if (!sound?.length) {
9
+ console.log(`cannot call the API without a sound, aborting..`)
10
+ // throw new Error(`cannot call the API without a sound, aborting..`)
11
+ return ""
12
+ }
13
+
14
+ try {
15
+ const request = {
16
+ sound
17
+ } as SoundAnalysisRequest
18
+
19
+ console.log(`calling ${apiUrl}/listen called with: `, {
20
+ sound: request.sound.slice(0, 20)
21
+ })
22
+
23
+ const res = await fetch(`${apiUrl}/listen`, {
24
+ method: "POST",
25
+ headers: {
26
+ Accept: "application/json",
27
+ "Content-Type": "application/json",
28
+ // Authorization: `Bearer ${process.env.VC_SECRET_ACCESS_TOKEN}`,
29
+ },
30
+ body: JSON.stringify(request),
31
+ cache: 'no-store',
32
+ // we can also use this (see https://vercel.com/blog/vercel-cache-api-nextjs-cache)
33
+ // next: { revalidate: 1 }
34
+ })
35
+
36
+ if (res.status !== 200) {
37
+ throw new Error('Failed to fetch data')
38
+ }
39
+
40
+ const response = (await res.json()) as SoundAnalysisResponse
41
+ return response.result
42
+ } catch (err) {
43
+ console.error(err)
44
+ return ""
45
+ }
46
+ }
src/app/engine/think.ts CHANGED
@@ -5,35 +5,38 @@ import { createLlamaPrompt } from "@/lib/createLlamaPrompt"
5
 
6
  import { predict } from "./predict"
7
 
8
- export const think = async ({
9
- event = "",
10
- observation = "",
11
- history = "",
12
- }: {
13
- event: string;
14
- observation: string;
15
- history: string;
16
- }): Promise<string> => {
17
  if (!event) {
18
  throw new Error("missing event")
19
  }
 
 
 
 
 
 
 
 
 
 
20
  const prompt = createLlamaPrompt([
21
  {
22
  role: "system",
23
  content: [
24
- `You are a companion robot, very friendly, curious about the world.`,
25
-
26
- // TODO: put the history here (from most recent to oldest)
27
- `You have been presented some situation in the past, but you lost your memory.`,
28
-
29
  `Today's date is ${format(new Date(), 'yyyy-MM-dd at HH:mm (d)')}.`,
30
- , `You are currently observing this: ${observation}`,
 
 
 
 
31
  ].filter(item => item).join("\n")
32
  },
33
- {
34
- role: "user",
35
- content: event,
36
- }
37
  ])
38
 
39
 
@@ -56,5 +59,10 @@ export const think = async ({
56
  // llama-2 is too chatty, let's keep 3 sentences at most
57
  const sentences = sbd.sentences(result).slice(0, 3).join(" ").trim()
58
 
 
 
 
 
 
59
  return sentences
60
  }
 
5
 
6
  import { predict } from "./predict"
7
 
8
+ const internalHistory: {
9
+ role: string;
10
+ content: string;
11
+ }[] = []
12
+
13
+ export const think = async (event: string): Promise<string> => {
 
 
 
14
  if (!event) {
15
  throw new Error("missing event")
16
  }
17
+
18
+ internalHistory.push({
19
+ role: "user",
20
+ content: event,
21
+ })
22
+
23
+ if (internalHistory.length > 10) {
24
+ internalHistory.shift()
25
+ }
26
+
27
  const prompt = createLlamaPrompt([
28
  {
29
  role: "system",
30
  content: [
 
 
 
 
 
31
  `Today's date is ${format(new Date(), 'yyyy-MM-dd at HH:mm (d)')}.`,
32
+ `You are an android robot, very friendly, curious about the world.`,
33
+ `Your life goal is to help human and interact them as a natural way.`,
34
+ `You are going to see and hear various things, and you need to act in a very natural way.`,
35
+ `If you see someone through your eyes, you need to interact with them,`,
36
+ `you should be o ngoing and open, ask questions, be curious, do jokes etc.`,
37
  ].filter(item => item).join("\n")
38
  },
39
+ ...internalHistory,
 
 
 
40
  ])
41
 
42
 
 
59
  // llama-2 is too chatty, let's keep 3 sentences at most
60
  const sentences = sbd.sentences(result).slice(0, 3).join(" ").trim()
61
 
62
+ internalHistory.push({
63
+ role: "assistant",
64
+ content: sentences,
65
+ })
66
+
67
  return sentences
68
  }
src/app/listen.tsx CHANGED
@@ -2,52 +2,297 @@
2
 
3
  import { useCallback, useEffect, useRef, useState, useTransition } from "react"
4
  import { useInterval } from "usehooks-ts"
 
 
5
  import { useRecorder } from "react-microphone-recorder"
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  // import { listen } from "./engine/listen"
8
 
 
 
 
 
 
 
 
 
 
 
 
9
  export function Listen({
10
  onListen,
11
  }: {
12
  onListen: (recording: string) => void
13
  }) {
14
- const [_isPending, startTransition] = useTransition()
15
-
16
- const {
17
- audioLevel,
18
- startRecording,
19
- pauseRecording,
20
- stopRecording,
21
- resetRecording,
22
- audioURL,
23
- recordingState,
24
- isRecording,
25
- audioFile
26
- } = useRecorder()
27
-
28
- const status = audioLevel > 18 ? "I hear something!" : "background noise"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  useInterval(() => {
31
- console.log("let's stop, and start again")
32
- stopRecording()
33
- startRecording()
34
  }, 3000)
35
 
36
  useEffect(() => {
37
- console.log("starting recording..")
38
- startRecording()
 
 
 
 
 
39
 
40
- startTransition(async () => {
41
- // await listen()
42
- })
43
  }, [])
44
 
45
- return null
46
- /*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  return (
48
- <div className="fixed top-64 left-16 z-10 bg-gray-100 p-4">
49
- <div>{status}</div>
 
 
 
 
 
 
50
  </div>
51
  )
52
- */
53
  }
 
2
 
3
  import { useCallback, useEffect, useRef, useState, useTransition } from "react"
4
  import { useInterval } from "usehooks-ts"
5
+
6
+ // TODO: try this? https://www.npmjs.com/package/react-audio-voice-recorder
7
  import { useRecorder } from "react-microphone-recorder"
8
+ import { getWaveBlob } from "webm-to-wav-converter"
9
+ import {
10
+ AvailableModels,
11
+ InferenceSession,
12
+ MicRecorder,
13
+ SessionManager,
14
+ } from "whisper-turbo"
15
+
16
+ import { useToast } from "@/components/ui/use-toast"
17
+ // import { listen } from "@/app/engine/listen"
18
+ import { blobToBase64Uri } from "@/lib/blobToBase64Uri"
19
 
20
  // import { listen } from "./engine/listen"
21
 
22
+ export interface TSSegment {
23
+ text: string;
24
+ start: number;
25
+ stop: number;
26
+ last: boolean;
27
+ }
28
+
29
+ export interface TSTranscript {
30
+ segments: Array<TSSegment>;
31
+ }
32
+
33
  export function Listen({
34
  onListen,
35
  }: {
36
  onListen: (recording: string) => void
37
  }) {
38
+ const { toast } = useToast()
39
+
40
+ const [transcribing, setTranscribing] = useState(false)
41
+ const transcribingRef = useRef(transcribing)
42
+ useEffect(() => { transcribingRef.current = transcribing }, [transcribing])
43
+
44
+ // used to detect changes, signal when we can analyze the audio
45
+ const [audioDataFrame, setAudioDataFrame] = useState(0)
46
+ const audioDataFrameRef = useRef(audioDataFrame)
47
+ useEffect(() => { audioDataFrameRef.current = audioDataFrame }, [audioDataFrame])
48
+
49
+ const [transcriptBuffer, setTranscriptBuffer] = useState("")
50
+ useEffect(() => {
51
+ onListen(transcriptBuffer)
52
+ }, [transcriptBuffer])
53
+ /*
54
+ Available models: {
55
+ WHISPER_TINY: 'whisper-tiny',
56
+ WHISPER_BASE: 'whisper-base',
57
+ WHISPER_SMALL: 'whisper-small',
58
+ WHISPER_MEDIUM: 'whisper-medium',
59
+ WHISPER_LARGE: 'whisper-large'
60
+ }
61
+ */
62
+ const whisperModel: AvailableModels = AvailableModels.WHISPER_BASE
63
+
64
+ const listenerRef = useRef({
65
+ isListening: false,
66
+ startedListeningAt: 0,
67
+ stoppedListeningAt: 0,
68
+ durationInMs: 0,
69
+ hits: 0,
70
+ debugCanContinue: true, // used for debugging
71
+ })
72
+
73
+ // the background listener is not a CIA spy device, but a detector of changes in the
74
+ // background noise volume level. The goal is to detect whenever an interesting event is happening
75
+ const backgroundListener = useRecorder()
76
+
77
+ // the foreground listener is the actual sound sampler
78
+ // with our system, it will always lag a bit behind the background listener
79
+ // however there might be a fix (which I haven't tried yet):
80
+ // to take the last second of the background listener sample,
81
+ // and glue it to the beginning of the foreground listener sample
82
+ //
83
+ // or, alternatively, we could just try to use a shorter time window for the background listener,
84
+ // to make it more reactive
85
+ const foregroundListener = useRecorder()
86
+
87
+ // to detect voice, we currently only compare the background audio level to a threshold (frequency sampling is not used here)
88
+ const heardSomething = backgroundListener.audioLevel > 12 // 18
89
+
90
+ const status = heardSomething ? "I hear something!" : "background noise"
91
+
92
+ const session = useRef<InferenceSession | null>(null)
93
+ const [audioData, setAudioData] = useState<Uint8Array | null>(null)
94
+ const [audioMetadata, setAudioMetadata] = useState<File | null>(null)
95
+ const [loaded, setLoaded] = useState<boolean>(false)
96
+ const [progress, setProgress] = useState<number>(0)
97
+
98
+ const isLoadingModel = progress > 0
99
+ const hasLoadedModel = progress === 100
100
+
101
+ const loadModel = async () => {
102
+ console.log("loadModel")
103
+ if (session.current) {
104
+ session.current.destroy()
105
+ }
106
+ if (!whisperModel) {
107
+ console.error("No whisper model loaded")
108
+ return
109
+ }
110
+
111
+ try {
112
+ const manager = new SessionManager()
113
+ const loadResult = await manager.loadModel(
114
+ whisperModel,
115
+ () => {
116
+ setLoaded(true)
117
+ },
118
+ (p: number) => {
119
+ console.log("progress:", p)
120
+ setProgress(p)
121
+ }
122
+ )
123
+ if (loadResult.isErr) {
124
+ throw new Error(loadResult.error.message)
125
+ } else {
126
+ session.current = loadResult.value
127
+ }
128
+ } catch (err) {
129
+ const error = `failed to load the model: ${err}`
130
+ console.error(error)
131
+ toast({
132
+ title: "Error",
133
+ description: error,
134
+ variant: "destructive"
135
+ })
136
+ }
137
+ }
138
 
139
+ const runSession = async () => {
140
+ if (!loaded) {
141
+ console.log("runSession: aborting (model not loaded yet)")
142
+ return
143
+ }
144
+ if (!session.current) {
145
+ console.log("runSession: aborting (no model loaded)")
146
+ toast({
147
+ title: "Error",
148
+ description: "No model loaded",
149
+ variant: "destructive"
150
+ })
151
+ return
152
+ }
153
+ // console.log("debug:", { audioData, audioDataFrame })
154
+ if (!audioData) {
155
+ console.log("runSession: aborting (no audio file loaded)")
156
+ toast({
157
+ title: "Error",
158
+ description: "No audio file loaded",
159
+ variant: "destructive"
160
+ })
161
+ return
162
+ }
163
+
164
+ setTranscribing(transcribingRef.current = true)
165
+
166
+ try {
167
+ await session.current.transcribe(audioData, (s: any) => {
168
+ const segment = s as { text: string, start: number, stop: number, last: boolean }
169
+ const text = segment.text.trim()
170
+ console.log("text:", text)
171
+ if (text) {
172
+ setTranscriptBuffer(text)
173
+ }
174
+
175
+ if (s.last) {
176
+ console.log("IS LAST")
177
+ setTranscribing(transcribingRef.current = false)
178
+ return
179
+ }
180
+ })
181
+ } catch (err) {
182
+ const error = `transcription crashed: ${err}`
183
+ console.error(error)
184
+ toast({
185
+ title: "Error",
186
+ description: "No audio file loaded",
187
+ variant: "destructive"
188
+ })
189
+ }
190
+ }
191
+
192
+ // let's disable the background recorder for now
193
  useInterval(() => {
194
+ // console.log("let's stop, and start again")
195
+ backgroundListener.stopRecording()
196
+ backgroundListener.startRecording()
197
  }, 3000)
198
 
199
  useEffect(() => {
200
+ const fn = async () => {
201
+ console.log("load model..")
202
+ await loadModel()
203
+
204
+ console.log("starting to listen to background noise to detect volume peaks..")
205
+ backgroundListener.startRecording()
206
+ }
207
 
208
+ fn()
 
 
209
  }, [])
210
 
211
+
212
+ useEffect(() => {
213
+ if (!audioData) {
214
+ console.log("no audio")
215
+ }
216
+ // console.log("audioDataFrame changed, need to process audioData!")
217
+ runSession()
218
+ }, [audioDataFrame])
219
+
220
+ useEffect(() => {
221
+ if (heardSomething) {
222
+ if (!listenerRef.current.isListening) {
223
+ console.log("recoording..")
224
+ foregroundListener.startRecording()
225
+ listenerRef.current.hits = 0
226
+ listenerRef.current.isListening = true
227
+
228
+ // TODO: use a debouncer to detect when we started speaking
229
+ setTimeout(async () => {
230
+ foregroundListener.stopRecording()
231
+ listenerRef.current.isListening = false
232
+ listenerRef.current.stoppedListeningAt = Date.now()
233
+ listenerRef.current.durationInMs =
234
+ listenerRef.current.stoppedListeningAt - listenerRef.current.startedListeningAt
235
+
236
+ const hits = listenerRef.current.hits
237
+
238
+ console.log(`end of sample (${foregroundListener.timeElapsed}, ${hits} hits)`)
239
+
240
+ if (!foregroundListener.audioBlob || typeof window === "undefined" || !window?.FileReader) {
241
+ return
242
+ }
243
+
244
+ if (hits > 11) {
245
+ // at 12 threshold level, we should have between 12 and 20 hits (per 2 sec) for short words and utterances
246
+ // at 12 threshold level, keystrokes should not be detected, unless the person hits the keyboard heavily
247
+
248
+ console.log("got an interesting sample, sending for review")
249
+
250
+ // temporary, to prevent infinite loop
251
+ if (listenerRef.current.debugCanContinue) {
252
+ // to prevent the infinite loop, set this value to false
253
+ // listenerRef.current.debugCanContinue = false
254
+
255
+ try {
256
+ const blob = await getWaveBlob(foregroundListener.audioBlob, false) // false = 16 bit, true = 32 bit
257
+ const arrayBuffer = await blob.arrayBuffer()
258
+ const uint8Array = new Uint8Array(arrayBuffer)
259
+
260
+ setAudioData(uint8Array)
261
+ setAudioDataFrame(audioDataFrameRef.current + 1)
262
+ } catch (err) {
263
+ const error = `failed to convert the audio sample: ${err}`
264
+ console.error(error)
265
+ toast({
266
+ title: "Error",
267
+ description: error,
268
+ variant: "destructive"
269
+ })
270
+ }
271
+ } else {
272
+ console.log("Julian: infinite loop temporary disabled :D")
273
+ }
274
+ }
275
+ }, 3000)
276
+ } else {
277
+ // TODO: increase hits?
278
+ // listenerRef.current.hits = listenerRef.current.hits + 1
279
+ }
280
+ }
281
+ }, [heardSomething])
282
+
283
+ if (heardSomething && listenerRef.current.isListening) {
284
+ listenerRef.current.hits = listenerRef.current.hits + 1
285
+ }
286
+
287
  return (
288
+ <div className="fixed top-80 left-16 z-10 bg-gray-100 p-4">
289
+ {isLoadingModel && hasLoadedModel
290
+ ? <p>Loading: ${progress}%</p>
291
+ : <p>{
292
+ transcriptBuffer
293
+ || ""
294
+ }</p>
295
+ }
296
  </div>
297
  )
 
298
  }
src/app/main.tsx CHANGED
@@ -10,6 +10,7 @@ import { think } from "./engine/think"
10
  import { Progress } from "./interface/progress"
11
  import { Listen } from "./listen"
12
  import { Speak } from "./speak"
 
13
 
14
  export default function Main() {
15
  const [_isPending, startTransition] = useTransition()
@@ -17,38 +18,25 @@ export default function Main() {
17
  const [lastRawObservation, setLastRawObservation] = useState<string>("")
18
  const [isLoadingAction, setLoadingAction] = useState(false)
19
 
20
- const [observations, setObservations] = useState<string[]>([])
21
  const [action, setAction] = useState<string>("Nothing to say yet.")
22
 
23
- // receive a new observation from what the agent is looking at
24
- const handleOnObserve = (observation: string, image: string) => {
25
- setLastRawObservation(observation)
26
- setLastImage(image)
27
-
28
- // last comes first
29
- setObservations([
30
- `On ${format(new Date(), 'yyyy-MM-dd at HH:mm (d)')}, you saw: \"${observation}\".`
31
- ].concat(observations))
32
-
33
- // TODO: use llama-2 to summarize previous observations
34
- const history = observations.slice(0, 3).join("\n")
35
-
36
-
37
  startTransition(async () => {
38
- setLoadingAction(true)
39
- const action = await think({
40
- history,
41
- observation,
42
- event: "Please react in a natural way to the current situation, by interacting with the person or entity you are seeing.",
43
- })
44
-
45
  setAction(action)
46
  setLoadingAction(false)
47
  })
48
  }
 
 
 
 
 
 
49
 
50
  const handleOnListen = (recording: string) => {
51
- console.log("on listen")
52
  }
53
 
54
  return (
@@ -93,8 +81,9 @@ export default function Main() {
93
  </div>
94
 
95
  <Observe onObserve={handleOnObserve} />
96
- {/*<Listen onListen={handleOnListen} />*/}
97
  <Speak>{action}</Speak>
 
98
 
99
  <Progress
100
  isLoading={isLoadingAction}
@@ -104,11 +93,12 @@ export default function Main() {
104
 
105
  <div className="fixed z-10 left-0 right-0 bottom-0 flex flex-col items-center justify-center">
106
  <div className="full md:w-[80%] lg:w-[70%] mb-0 md:p-4 md:mb-8 bg-zinc-100 md:rounded-xl p-4 shadow-2xl text-xs md:text-sm">
107
- <p>🅿️ <span className="font-semibold">Informations: </span> This demo uses
108
- <a href="https://huggingface.co/HuggingFaceM4/idefics-80b#bias-evaluation" target="_blank" className="font-semibold"> IDEFICS </a>
109
- and
110
- <a href="https://huggingface.co/meta-llama" target="_blank" className="font-semibold"> Llama-2 </a>, and is provided for demonstration and research purposes.</p>
111
- <p>⛔️ <span className="font-semibold">Limitations: </span> This demo is provided as-is, with no guarantee of factually correct results. In some cases, the models may return hallucinated or innapropriate responses.</p>
 
112
  </div>
113
  </div>
114
  </div>
 
10
  import { Progress } from "./interface/progress"
11
  import { Listen } from "./listen"
12
  import { Speak } from "./speak"
13
+ import { Toaster } from "@/components/ui/toaster"
14
 
15
  export default function Main() {
16
  const [_isPending, startTransition] = useTransition()
 
18
  const [lastRawObservation, setLastRawObservation] = useState<string>("")
19
  const [isLoadingAction, setLoadingAction] = useState(false)
20
 
 
21
  const [action, setAction] = useState<string>("Nothing to say yet.")
22
 
23
+ const handleOnEvent = (event: string) => {
24
+ setLoadingAction(true)
 
 
 
 
 
 
 
 
 
 
 
 
25
  startTransition(async () => {
26
+ const action = await think(event)
 
 
 
 
 
 
27
  setAction(action)
28
  setLoadingAction(false)
29
  })
30
  }
31
+ // receive a new observation from what the agent is looking at
32
+ const handleOnObserve = (observation: string, image: string) => {
33
+ setLastRawObservation(observation)
34
+ setLastImage(image)
35
+ handleOnEvent(`It is ${format(new Date(), 'HH:mm (d)')} and you are seeing this: ${observation}`)
36
+ }
37
 
38
  const handleOnListen = (recording: string) => {
39
+ handleOnEvent(`It is ${format(new Date(), 'HH:mm (d)')} and you are hearing this: ${recording}`)
40
  }
41
 
42
  return (
 
81
  </div>
82
 
83
  <Observe onObserve={handleOnObserve} />
84
+ <Listen onListen={handleOnListen} />
85
  <Speak>{action}</Speak>
86
+ <Toaster />
87
 
88
  <Progress
89
  isLoading={isLoadingAction}
 
93
 
94
  <div className="fixed z-10 left-0 right-0 bottom-0 flex flex-col items-center justify-center">
95
  <div className="full md:w-[80%] lg:w-[70%] mb-0 md:p-4 md:mb-8 bg-zinc-100 md:rounded-xl p-4 shadow-2xl text-xs md:text-sm">
96
+ <p>🅿️ <span className="font-semibold">
97
+ </span>This multimodal demo allow
98
+ <a href="https://huggingface.co/meta-llama" target="_blank" className="font-semibold underline"> Llama-2 </a> to hear, see and talk.
99
+ You need to upgrade to a <a href="https://caniuse.com/webgpu" target="_blank" className="font-semibold underline">browser with support for WebGPU</a> for speech recognition to work.
100
+ Vision is handled by <a href="https://huggingface.co/HuggingFaceM4/idefics-80b#bias-evaluation" target="_blank" className="font-semibold underline"> IDEFICS </a></p>
101
+ <p>⛔️ <span className="font-semibold">Limitations: </span>This demo is provided as-is, for demonstration and research purpose only. As it demonstrates WebGPU technology, this demo will not support incompatible browsers and/or devices. No guarantee of factually correct results. In some cases, the models may return hallucinated or innapropriate responses.</p>
102
  </div>
103
  </div>
104
  </div>
src/app/observe.tsx CHANGED
@@ -66,7 +66,7 @@ export function Observe({
66
 
67
  setBusy(true)
68
 
69
- console.log("Capturing new frame from webcam..")
70
 
71
  startTransition(async () => {
72
  const imageBase64 = capture()
@@ -80,10 +80,10 @@ export function Observe({
80
  }
81
  const prompt = `What do you see here?`
82
 
83
- console.log("Calling IDEFICS..")
84
- const newObservation = await see({ prompt, imageBase64 })
85
 
86
- console.log("New observation: ", newObservation)
87
  if (newObservation !== lastObservation) {
88
  // console.log("update!")
89
  setLastObservation(newObservation || "")
 
66
 
67
  setBusy(true)
68
 
69
+ // console.log("Capturing new frame from webcam..")
70
 
71
  startTransition(async () => {
72
  const imageBase64 = capture()
 
80
  }
81
  const prompt = `What do you see here?`
82
 
83
+ // console.log("Calling IDEFICS..")
84
+ const newObservation = "fake" // await see({ prompt, imageBase64 })
85
 
86
+ // console.log("New observation: ", newObservation)
87
  if (newObservation !== lastObservation) {
88
  // console.log("update!")
89
  setLastObservation(newObservation || "")
src/app/speak.tsx CHANGED
@@ -46,11 +46,14 @@ export function Speak({
46
  if (newMessage === playedMessage) { return }
47
  const synth = window.speechSynthesis
48
 
49
- console.log(`Speaking "${newMessage}"`)
50
  setPlayedMessage(newMessage)
51
  const utterance = new SpeechSynthesisUtterance(newMessage)
52
  utterance.voice = voice
53
- synth.speak(utterance)
 
 
 
54
  }, [voice?.name, newMessage, playedMessage])
55
 
56
  return (
 
46
  if (newMessage === playedMessage) { return }
47
  const synth = window.speechSynthesis
48
 
49
+ // console.log(`Speaking "${newMessage}"`)
50
  setPlayedMessage(newMessage)
51
  const utterance = new SpeechSynthesisUtterance(newMessage)
52
  utterance.voice = voice
53
+
54
+ console.log("julian: voice disabled :D")
55
+ // synth.speak(utterance)
56
+
57
  }, [voice?.name, newMessage, playedMessage])
58
 
59
  return (
src/components/ui/dialog.tsx CHANGED
@@ -11,10 +11,9 @@ const Dialog = DialogPrimitive.Root
11
  const DialogTrigger = DialogPrimitive.Trigger
12
 
13
  const DialogPortal = ({
14
- className,
15
  ...props
16
  }: DialogPrimitive.DialogPortalProps) => (
17
- <DialogPrimitive.Portal className={cn(className)} {...props} />
18
  )
19
  DialogPortal.displayName = DialogPrimitive.Portal.displayName
20
 
 
11
  const DialogTrigger = DialogPrimitive.Trigger
12
 
13
  const DialogPortal = ({
 
14
  ...props
15
  }: DialogPrimitive.DialogPortalProps) => (
16
+ <DialogPrimitive.Portal {...props} />
17
  )
18
  DialogPortal.displayName = DialogPrimitive.Portal.displayName
19
 
src/components/ui/toast.tsx ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import * as React from "react"
2
+ import * as ToastPrimitives from "@radix-ui/react-toast"
3
+ import { cva, type VariantProps } from "class-variance-authority"
4
+ import { X } from "lucide-react"
5
+
6
+ import { cn } from "@/lib/utils"
7
+
8
+ const ToastProvider = ToastPrimitives.Provider
9
+
10
+ const ToastViewport = React.forwardRef<
11
+ React.ElementRef<typeof ToastPrimitives.Viewport>,
12
+ React.ComponentPropsWithoutRef<typeof ToastPrimitives.Viewport>
13
+ >(({ className, ...props }, ref) => (
14
+ <ToastPrimitives.Viewport
15
+ ref={ref}
16
+ className={cn(
17
+ "fixed top-0 z-[100] flex max-h-screen w-full flex-col-reverse p-4 sm:bottom-0 sm:right-0 sm:top-auto sm:flex-col md:max-w-[420px]",
18
+ className
19
+ )}
20
+ {...props}
21
+ />
22
+ ))
23
+ ToastViewport.displayName = ToastPrimitives.Viewport.displayName
24
+
25
+ const toastVariants = cva(
26
+ "group pointer-events-auto relative flex w-full items-center justify-between space-x-4 overflow-hidden rounded-md border border-stone-200 p-6 pr-8 shadow-lg transition-all data-[swipe=cancel]:translate-x-0 data-[swipe=end]:translate-x-[var(--radix-toast-swipe-end-x)] data-[swipe=move]:translate-x-[var(--radix-toast-swipe-move-x)] data-[swipe=move]:transition-none data-[state=open]:animate-in data-[state=closed]:animate-out data-[swipe=end]:animate-out data-[state=closed]:fade-out-80 data-[state=closed]:slide-out-to-right-full data-[state=open]:slide-in-from-top-full data-[state=open]:sm:slide-in-from-bottom-full dark:border-stone-800",
27
+ {
28
+ variants: {
29
+ variant: {
30
+ default: "border bg-white text-stone-950 dark:bg-stone-950 dark:text-stone-50",
31
+ destructive:
32
+ "destructive group border-red-500 bg-red-500 text-stone-50 dark:border-red-900 dark:bg-red-900 dark:text-stone-50",
33
+ },
34
+ },
35
+ defaultVariants: {
36
+ variant: "default",
37
+ },
38
+ }
39
+ )
40
+
41
+ const Toast = React.forwardRef<
42
+ React.ElementRef<typeof ToastPrimitives.Root>,
43
+ React.ComponentPropsWithoutRef<typeof ToastPrimitives.Root> &
44
+ VariantProps<typeof toastVariants>
45
+ >(({ className, variant, ...props }, ref) => {
46
+ return (
47
+ <ToastPrimitives.Root
48
+ ref={ref}
49
+ className={cn(toastVariants({ variant }), className)}
50
+ {...props}
51
+ />
52
+ )
53
+ })
54
+ Toast.displayName = ToastPrimitives.Root.displayName
55
+
56
+ const ToastAction = React.forwardRef<
57
+ React.ElementRef<typeof ToastPrimitives.Action>,
58
+ React.ComponentPropsWithoutRef<typeof ToastPrimitives.Action>
59
+ >(({ className, ...props }, ref) => (
60
+ <ToastPrimitives.Action
61
+ ref={ref}
62
+ className={cn(
63
+ "inline-flex h-8 shrink-0 items-center justify-center rounded-md border border-stone-200 bg-transparent px-3 text-sm font-medium ring-offset-white transition-colors hover:bg-stone-100 focus:outline-none focus:ring-2 focus:ring-stone-950 focus:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 group-[.destructive]:border-stone-100/40 group-[.destructive]:hover:border-red-500/30 group-[.destructive]:hover:bg-red-500 group-[.destructive]:hover:text-stone-50 group-[.destructive]:focus:ring-red-500 dark:border-stone-800 dark:ring-offset-stone-950 dark:hover:bg-stone-800 dark:focus:ring-stone-300 dark:group-[.destructive]:border-stone-800/40 dark:group-[.destructive]:hover:border-red-900/30 dark:group-[.destructive]:hover:bg-red-900 dark:group-[.destructive]:hover:text-stone-50 dark:group-[.destructive]:focus:ring-red-900",
64
+ className
65
+ )}
66
+ {...props}
67
+ />
68
+ ))
69
+ ToastAction.displayName = ToastPrimitives.Action.displayName
70
+
71
+ const ToastClose = React.forwardRef<
72
+ React.ElementRef<typeof ToastPrimitives.Close>,
73
+ React.ComponentPropsWithoutRef<typeof ToastPrimitives.Close>
74
+ >(({ className, ...props }, ref) => (
75
+ <ToastPrimitives.Close
76
+ ref={ref}
77
+ className={cn(
78
+ "absolute right-2 top-2 rounded-md p-1 text-stone-950/50 opacity-0 transition-opacity hover:text-stone-950 focus:opacity-100 focus:outline-none focus:ring-2 group-hover:opacity-100 group-[.destructive]:text-red-300 group-[.destructive]:hover:text-red-50 group-[.destructive]:focus:ring-red-400 group-[.destructive]:focus:ring-offset-red-600 dark:text-stone-50/50 dark:hover:text-stone-50",
79
+ className
80
+ )}
81
+ toast-close=""
82
+ {...props}
83
+ >
84
+ <X className="h-4 w-4" />
85
+ </ToastPrimitives.Close>
86
+ ))
87
+ ToastClose.displayName = ToastPrimitives.Close.displayName
88
+
89
+ const ToastTitle = React.forwardRef<
90
+ React.ElementRef<typeof ToastPrimitives.Title>,
91
+ React.ComponentPropsWithoutRef<typeof ToastPrimitives.Title>
92
+ >(({ className, ...props }, ref) => (
93
+ <ToastPrimitives.Title
94
+ ref={ref}
95
+ className={cn("text-sm font-semibold", className)}
96
+ {...props}
97
+ />
98
+ ))
99
+ ToastTitle.displayName = ToastPrimitives.Title.displayName
100
+
101
+ const ToastDescription = React.forwardRef<
102
+ React.ElementRef<typeof ToastPrimitives.Description>,
103
+ React.ComponentPropsWithoutRef<typeof ToastPrimitives.Description>
104
+ >(({ className, ...props }, ref) => (
105
+ <ToastPrimitives.Description
106
+ ref={ref}
107
+ className={cn("text-sm opacity-90", className)}
108
+ {...props}
109
+ />
110
+ ))
111
+ ToastDescription.displayName = ToastPrimitives.Description.displayName
112
+
113
+ type ToastProps = React.ComponentPropsWithoutRef<typeof Toast>
114
+
115
+ type ToastActionElement = React.ReactElement<typeof ToastAction>
116
+
117
+ export {
118
+ type ToastProps,
119
+ type ToastActionElement,
120
+ ToastProvider,
121
+ ToastViewport,
122
+ Toast,
123
+ ToastTitle,
124
+ ToastDescription,
125
+ ToastClose,
126
+ ToastAction,
127
+ }
src/components/ui/toaster.tsx ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client"
2
+
3
+ import {
4
+ Toast,
5
+ ToastClose,
6
+ ToastDescription,
7
+ ToastProvider,
8
+ ToastTitle,
9
+ ToastViewport,
10
+ } from "@/components/ui/toast"
11
+ import { useToast } from "@/components/ui/use-toast"
12
+
13
+ export function Toaster() {
14
+ const { toasts } = useToast()
15
+
16
+ return (
17
+ <ToastProvider>
18
+ {toasts.map(function ({ id, title, description, action, ...props }) {
19
+ return (
20
+ <Toast key={id} {...props}>
21
+ <div className="grid gap-1">
22
+ {title && <ToastTitle>{title}</ToastTitle>}
23
+ {description && (
24
+ <ToastDescription>{description}</ToastDescription>
25
+ )}
26
+ </div>
27
+ {action}
28
+ <ToastClose />
29
+ </Toast>
30
+ )
31
+ })}
32
+ <ToastViewport />
33
+ </ToastProvider>
34
+ )
35
+ }
src/components/ui/use-toast.ts ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Inspired by react-hot-toast library
2
+ import * as React from "react"
3
+
4
+ import type {
5
+ ToastActionElement,
6
+ ToastProps,
7
+ } from "@/components/ui/toast"
8
+
9
/** Maximum number of toasts kept in state; `ADD_TOAST` truncates the list to this length. */
const TOAST_LIMIT = 1

/**
 * Delay in milliseconds (about 16.7 minutes) between dismissing a toast and
 * removing it from state; passed to `setTimeout` by `addToRemoveQueue`.
 */
const TOAST_REMOVE_DELAY = 1_000_000
11
+
12
/**
 * A toast as stored in the toaster state: the `Toast` component props plus a
 * required `id` and the optional content slots rendered by `Toaster`.
 */
type ToasterToast = ToastProps & {
  /** Identifier used to update, dismiss and remove this toast. */
  id: string
  /** Optional heading content. */
  title?: React.ReactNode
  /** Optional body content. */
  description?: React.ReactNode
  /** Optional action element rendered alongside the text. */
  action?: ToastActionElement
}
18
+
19
/**
 * Names of the actions understood by `reducer`.
 * Declared `as const` so each value keeps its string-literal type, which the
 * `Action` union uses as its discriminant.
 */
const actionTypes = {
  ADD_TOAST: "ADD_TOAST",
  UPDATE_TOAST: "UPDATE_TOAST",
  DISMISS_TOAST: "DISMISS_TOAST",
  REMOVE_TOAST: "REMOVE_TOAST",
} as const
25
+
26
/** Module-level counter backing the ids handed out by `genId`. */
let count = 0

/**
 * Returns the next toast id as a decimal string ("1", "2", ...).
 *
 * The counter wraps at `Number.MAX_SAFE_INTEGER` rather than `Number.MAX_VALUE`:
 * beyond 2^53 a double can no longer represent `count + 1` exactly, so the
 * increment would stop changing the value and the same id would be returned
 * forever instead of wrapping.
 */
function genId(): string {
  count = (count + 1) % Number.MAX_SAFE_INTEGER
  return count.toString()
}
32
+
33
/** Type of the `actionTypes` constant; indexed below to get each literal action name. */
type ActionType = typeof actionTypes

/**
 * Every action accepted by `reducer`, discriminated by `type`.
 * - ADD_TOAST carries a complete toast.
 * - UPDATE_TOAST carries a partial toast that is merged into the toast with the same `id`.
 * - DISMISS_TOAST / REMOVE_TOAST target one toast by id, or all toasts when `toastId` is omitted.
 */
type Action =
  | { type: ActionType["ADD_TOAST"]; toast: ToasterToast }
  | { type: ActionType["UPDATE_TOAST"]; toast: Partial<ToasterToast> }
  | { type: ActionType["DISMISS_TOAST"]; toastId?: ToasterToast["id"] }
  | { type: ActionType["REMOVE_TOAST"]; toastId?: ToasterToast["id"] }
52
+
53
/** Shape of the toaster store shared by `reducer`, `dispatch` and `useToast`. */
interface State {
  /** Toasts currently held in the store; new toasts are added at the front. */
  toasts: ToasterToast[]
}
56
+
57
/** Pending removal timers, keyed by toast id. */
const toastTimeouts = new Map<string, ReturnType<typeof setTimeout>>()

/**
 * Schedules a `REMOVE_TOAST` dispatch for `toastId` after `TOAST_REMOVE_DELAY`
 * milliseconds. Does nothing when a timer for that id is already pending.
 */
const addToRemoveQueue = (toastId: string) => {
  const alreadyQueued = toastTimeouts.has(toastId)

  if (!alreadyQueued) {
    const handle = setTimeout(() => {
      // Forget the timer first so the same id can be queued again later.
      toastTimeouts.delete(toastId)
      dispatch({ type: "REMOVE_TOAST", toastId })
    }, TOAST_REMOVE_DELAY)

    toastTimeouts.set(toastId, handle)
  }
}
74
+
75
/**
 * Computes the next toaster state for an action.
 *
 * - ADD_TOAST: puts the toast at the front and keeps at most `TOAST_LIMIT` toasts.
 * - UPDATE_TOAST: shallow-merges the payload into the toast with the same `id`.
 * - DISMISS_TOAST: sets `open: false` on the targeted toast (or on all toasts
 *   when `toastId` is undefined) and schedules its removal.
 * - REMOVE_TOAST: drops the targeted toast, or all toasts when `toastId` is undefined.
 *
 * Note: DISMISS_TOAST is not pure, because it starts removal timers through
 * `addToRemoveQueue`.
 */
export const reducer = (state: State, action: Action): State => {
  switch (action.type) {
    case "ADD_TOAST": {
      const withNewToast = [action.toast, ...state.toasts]
      return { ...state, toasts: withNewToast.slice(0, TOAST_LIMIT) }
    }

    case "UPDATE_TOAST": {
      const patch = action.toast
      return {
        ...state,
        toasts: state.toasts.map((existing) =>
          existing.id === patch.id ? { ...existing, ...patch } : existing
        ),
      }
    }

    case "DISMISS_TOAST": {
      const targetId = action.toastId

      // ! Side effects ! - A falsy id means every toast gets queued for removal.
      const idsToQueue = targetId
        ? [targetId]
        : state.toasts.map((existing) => existing.id)
      idsToQueue.forEach((id) => {
        addToRemoveQueue(id)
      })

      return {
        ...state,
        toasts: state.toasts.map((existing) => {
          const isTargeted = existing.id === targetId || targetId === undefined
          return isTargeted ? { ...existing, open: false } : existing
        }),
      }
    }

    case "REMOVE_TOAST": {
      const removedId = action.toastId
      const remaining =
        removedId === undefined
          ? []
          : state.toasts.filter((existing) => existing.id !== removedId)
      return { ...state, toasts: remaining }
    }
  }
}
129
+
130
/** Subscribers notified after every dispatched action. */
const listeners: Array<(state: State) => void> = []

/** Module-level store: the state outlives any single component instance. */
let memoryState: State = { toasts: [] }

/**
 * Runs `action` through `reducer`, stores the result in `memoryState`, and
 * passes the current `memoryState` to each registered listener.
 */
function dispatch(action: Action) {
  memoryState = reducer(memoryState, action)

  listeners.forEach((notify) => {
    notify(memoryState)
  })
}
140
+
141
/** Input accepted by `toast()`: a toast without an `id`, which is generated internally. */
type Toast = Omit<ToasterToast, "id">

/**
 * Creates a toast and shows it.
 *
 * A fresh id is generated, then the toast is dispatched with `open: true` and
 * an `onOpenChange` handler that dismisses it when it is closed.
 *
 * @returns the generated `id`, plus `dismiss` and `update` helpers bound to it.
 */
function toast({ ...options }: Toast) {
  const id = genId()

  // Replaces the toast's fields; the id is always forced to this toast's id.
  function update(props: ToasterToast) {
    return dispatch({ type: "UPDATE_TOAST", toast: { ...props, id } })
  }

  function dismiss() {
    return dispatch({ type: "DISMISS_TOAST", toastId: id })
  }

  const created = {
    ...options,
    id,
    open: true,
    onOpenChange: (isOpen: boolean) => {
      if (isOpen) return
      dismiss()
    },
  }
  dispatch({ type: "ADD_TOAST", toast: created })

  return { id, dismiss, update }
}
171
+
172
/**
 * React hook exposing the shared toaster store.
 *
 * The component's local state starts from the module-level `memoryState` and
 * is refreshed whenever `dispatch` notifies the registered listeners.
 *
 * @returns the current `toasts`, the `toast()` creator, and `dismiss(toastId?)`,
 *          which dismisses one toast or, when no id is given, all of them.
 */
function useToast() {
  const [state, setState] = React.useState<State>(memoryState)

  React.useEffect(() => {
    listeners.push(setState)
    return () => {
      const index = listeners.indexOf(setState)
      if (index > -1) {
        listeners.splice(index, 1)
      }
    }
    // Subscribe once per mount. `setState` is referentially stable, so
    // depending on `state` only made the listener unsubscribe and resubscribe
    // on every store update.
  }, [])

  return {
    ...state,
    toast,
    dismiss: (toastId?: string) => dispatch({ type: "DISMISS_TOAST", toastId }),
  }
}
191
+
192
+ export { useToast, toast }
src/lib/blobToBase64Uri.ts ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Converts a Blob to a base64 data URI using the browser's `FileReader`.
 *
 * This is a best-effort helper and never rejects. It resolves to an empty
 * string when no blob is given, when `window` or `FileReader` is unavailable
 * (e.g. outside the browser), or when reading fails.
 *
 * @param blob - the blob to encode; may be omitted.
 * @returns the data URI produced by `readAsDataURL`, or `""` on any failure.
 */
export function blobToBase64Uri(blob?: Blob): Promise<string> {
  return new Promise((resolve) => {
    if (!blob || typeof window === "undefined" || !window.FileReader) {
      resolve("")
      return
    }

    const reader = new window.FileReader()

    // Attach the handlers before starting the read so no event can be missed.
    reader.onloadend = () => {
      resolve(`${reader.result || ""}`)
    }
    reader.onerror = () => {
      resolve("")
    }

    try {
      reader.readAsDataURL(blob)
    } catch {
      // readAsDataURL throws synchronously on invalid input (e.g. a non-Blob
      // value); keep the "never reject" contract instead of letting the
      // executor's exception reject the promise.
      resolve("")
    }
  })
}
src/types.ts CHANGED
@@ -7,3 +7,14 @@ export interface ImageAnalysisResponse {
7
  result: string
8
  error?: string
9
  }
 
 
 
 
 
 
 
 
 
 
 
 
7
  result: string
8
  error?: string
9
  }
10
+
11
+
12
/** Request payload for a sound analysis call. */
export interface SoundAnalysisRequest {
  /** Audio content, encoded in base64. */
  sound: string
  /** Text prompt sent with the audio. NOTE(review): its exact effect is not visible here; confirm against the API. */
  prompt: string
}
16
+
17
/** Response payload of a sound analysis call. */
export interface SoundAnalysisResponse {
  /** Analysis output as text (presumably the transcription; confirm against the API). */
  result: string
  /** Optional error message. */
  error?: string
}
20
+ }