r3Vibe committed on
Commit
e867911
·
1 Parent(s): d3d2211
app/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (152 Bytes). View file
 
app/__pycache__/main.cpython-310.pyc ADDED
Binary file (1.61 kB). View file
 
app/__pycache__/matcher.cpython-310.pyc ADDED
Binary file (807 Bytes). View file
 
app/__pycache__/mfcc.cpython-310.pyc ADDED
Binary file (1.63 kB). View file
 
app/__pycache__/string_processor.cpython-310.pyc ADDED
Binary file (657 Bytes). View file
 
app/__pycache__/transcriber.cpython-310.pyc ADDED
Binary file (1.16 kB). View file
 
app/main.py CHANGED
@@ -15,6 +15,8 @@ app = FastAPI(
15
  {
16
  "url": "http://127.0.0.1:8000/api/v1",
17
  "description": "Local Server",
 
 
18
  "url": "https://r3vibe-mother-tongue.hf.space/api/v1",
19
  "description": "Huggingface Server",
20
  }
 
15
  {
16
  "url": "http://127.0.0.1:8000/api/v1",
17
  "description": "Local Server",
18
+ },
19
+ {
20
  "url": "https://r3vibe-mother-tongue.hf.space/api/v1",
21
  "description": "Huggingface Server",
22
  }
app/matcher.py CHANGED
@@ -18,8 +18,6 @@ def sequence_match(a, b):
18
  return difflib.SequenceMatcher(None, a, b).ratio()
19
 
20
 
21
-
22
-
23
  def match(original, transcription):
24
  sequence = sequence_match(original, transcription)
25
  phonetic = phonetic_match(original, transcription)
 
18
  return difflib.SequenceMatcher(None, a, b).ratio()
19
 
20
 
 
 
21
  def match(original, transcription):
22
  sequence = sequence_match(original, transcription)
23
  phonetic = phonetic_match(original, transcription)
app/routers/V1/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (163 Bytes). View file
 
app/routers/V1/__pycache__/v1_routers.cpython-310.pyc ADDED
Binary file (411 Bytes). View file
 
app/routers/V1/voice/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (169 Bytes). View file
 
app/routers/V1/voice/__pycache__/voice_router.cpython-310.pyc ADDED
Binary file (1.94 kB). View file
 
app/routers/V1/voice/voice_router.py CHANGED
@@ -6,7 +6,7 @@ import os
6
  from app.transcriber import get_transcription
7
  from app.matcher import match
8
  from app.mfcc import mfcc_similarty_check
9
- from app.string_processor import process_text
10
 
11
 
12
  """ initialize the router """
@@ -51,7 +51,8 @@ async def transcribe_audio(
51
 
52
  try:
53
  text = get_transcription(filename_recorded)
54
- sequence, phonetic = match(matcher_text, process_text(text))
 
55
  Euclidean, Cosine = mfcc_similarty_check(
56
  filename_original, filename_recorded
57
  )
 
6
  from app.transcriber import get_transcription
7
  from app.matcher import match
8
  from app.mfcc import mfcc_similarty_check
9
+ from app.string_processor import clean_transcription
10
 
11
 
12
  """ initialize the router """
 
51
 
52
  try:
53
  text = get_transcription(filename_recorded)
54
+ text = clean_transcription(text)
55
+ sequence, phonetic = match(matcher_text, text)
56
  Euclidean, Cosine = mfcc_similarty_check(
57
  filename_original, filename_recorded
58
  )
app/routers/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (160 Bytes). View file
 
app/routers/__pycache__/routes.cpython-310.pyc ADDED
Binary file (378 Bytes). View file
 
app/string_processor.py CHANGED
@@ -1,18 +1,22 @@
1
- import string
2
  import re
3
 
4
 
5
- def process_text(text):
6
- # Step 1: Strip whitespace from both ends
7
- text = text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- # Step 2: Remove all punctuation (including full stops and commas)
10
- text = text.translate(str.maketrans("", "", string.punctuation))
11
-
12
- # Step 3: Extract sentences (assuming you want to keep the text as a whole sentence)
13
- sentences = re.split(r"(?<=[.!?]) +", text)
14
-
15
- # Combine the sentences back into a single string without punctuation
16
- processed_text = " ".join(sentences)
17
-
18
- return processed_text
 
1
+ import unicodedata
2
  import re
3
 
4
 
5
+ def clean_transcription(text):
6
+ # Normalize the text to NFKD form
7
+ normalized_text = unicodedata.normalize('NFKD', text)
8
+
9
+ # Remove diacritics
10
+ cleaned_text = ''.join([c for c in normalized_text if not unicodedata.combining(c)])
11
+
12
+ # Explicitly remove the leading ʻ character and any other specific characters
13
+ cleaned_text = cleaned_text.replace('ʻ', '')
14
+
15
+ # Remove any remaining special characters (if any)
16
+ cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
17
+
18
+ # Ensure the text is stripped of any unwanted leading or trailing whitespace
19
+ cleaned_text = cleaned_text.strip()
20
+
21
+ return cleaned_text
22