Spaces:
Running
Running
p
committed on
Commit
·
ba0fb36
1
Parent(s):
7b6aa43
enable some langs supported by num2words
Browse files
- app.py +9 -5
- num2words_lang_map.json +29 -0
app.py
CHANGED
@@ -47,8 +47,12 @@ lang_codes = {key + " (" + lang_codes[key] + ")": lang_codes[key] for key in lan
|
|
47 |
# Extract language names
|
48 |
language_names = list(lang_codes.keys())
|
49 |
|
|
|
|
|
|
|
50 |
|
51 |
-
|
|
|
52 |
# Find all numbers in the text using regex
|
53 |
numbers = re.findall(r"\d+", text)
|
54 |
# Sort numbers in descending order of length
|
@@ -57,7 +61,7 @@ def convert_eng_numbers_to_words(text):
|
|
57 |
|
58 |
# Replace numbers with their word equivalents
|
59 |
for number in sorted_numbers:
|
60 |
-
number_word = num2words(int(number))
|
61 |
text = text.replace(number, number_word)
|
62 |
|
63 |
return text
|
@@ -82,9 +86,9 @@ def prepare_sentences(text, lang="mya"):
|
|
82 |
text = convert_mya_numbers_to_words(text)
|
83 |
text = text.replace("\u104A", ",").replace("\u104B", ".")
|
84 |
|
85 |
-
if lang
|
86 |
-
|
87 |
-
|
88 |
print("Processed text", text)
|
89 |
|
90 |
paragraphs = [paragraph for paragraph in text.split("\n") if paragraph.strip()]
|
|
|
47 |
# Extract language names
|
48 |
language_names = list(lang_codes.keys())
|
49 |
|
50 |
+
# Load num2words_lang_map
|
51 |
+
with open("num2words_lang_map.json") as f:
|
52 |
+
num2words_lang_map = json.load(f, object_pairs_hook=OrderedDict)
|
53 |
|
54 |
+
|
55 |
+
def convert_numbers_to_words_num2words(text, lang):
|
56 |
# Find all numbers in the text using regex
|
57 |
numbers = re.findall(r"\d+", text)
|
58 |
# Sort numbers in descending order of length
|
|
|
61 |
|
62 |
# Replace numbers with their word equivalents
|
63 |
for number in sorted_numbers:
|
64 |
+
number_word = num2words(int(number), lang=num2words_lang_map[lang][0])
|
65 |
text = text.replace(number, number_word)
|
66 |
|
67 |
return text
|
|
|
86 |
text = convert_mya_numbers_to_words(text)
|
87 |
text = text.replace("\u104A", ",").replace("\u104B", ".")
|
88 |
|
89 |
+
if lang in num2words_lang_map:
|
90 |
+
print("num2words supports this lang", lang)
|
91 |
+
text = convert_numbers_to_words_num2words(text, lang)
|
92 |
print("Processed text", text)
|
93 |
|
94 |
paragraphs = [paragraph for paragraph in text.split("\n") if paragraph.strip()]
|
num2words_lang_map.json
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eng": ["en", "English, default"],
|
3 |
+
"amh": ["am", "Amharic"],
|
4 |
+
"ara": ["ar", "Arabic"],
|
5 |
+
"deu": ["de", "German"],
|
6 |
+
"spa": ["es", "Spanish"],
|
7 |
+
"fas": ["fa", "Farsi"],
|
8 |
+
"fin": ["fi", "Finnish"],
|
9 |
+
"fra": ["fr", "French"],
|
10 |
+
"heb": ["he", "Hebrew"],
|
11 |
+
"hun": ["hu", "Hungarian"],
|
12 |
+
"ind": ["id", "Indonesian"],
|
13 |
+
"isl": ["is", "Icelandic"],
|
14 |
+
"kan": ["kn", "Kannada"],
|
15 |
+
"kor": ["ko", "Korean"],
|
16 |
+
"kaz": ["kz", "Kazakh"],
|
17 |
+
"lav": ["lv", "Latvian"],
|
18 |
+
"pol": ["pl", "Polish"],
|
19 |
+
"swe": ["sv", "Swedish"],
|
20 |
+
"ron": ["ro", "Romanian"],
|
21 |
+
"rus": ["ru", "Russian"],
|
22 |
+
"tel": ["te", "Telugu"],
|
23 |
+
"tgk": ["tg", "Tajik"],
|
24 |
+
"tur": ["tr", "Turkish"],
|
25 |
+
"tha": ["th", "Thai"],
|
26 |
+
"vie": ["vi", "Vietnamese"],
|
27 |
+
"nld": ["nl", "Dutch"],
|
28 |
+
"ukr": ["uk", "Ukrainian"]
|
29 |
+
}
|