sdhanabal1 committed c98407b
Parent(s): c2b444a

Revert to nltk word tokenizer
Summarizer.py  CHANGED  (+3 -3)
@@ -1,4 +1,5 @@
 import string
+import nltk
 
 from sumy.parsers import DocumentParser
 from sumy.parsers.html import HtmlParser
@@ -7,13 +8,12 @@ from sumy.nlp.tokenizers import Tokenizer
 from sumy.nlp.stemmers import Stemmer
 from sumy.summarizers.lsa import LsaSummarizer
 from sumy.utils import get_stop_words
-from transformers import Pipeline
+from transformers import Pipeline
 
 
 class Summarizer:
     DEFAULT_LANGUAGE = "english"
     DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
-    TOKENIZER = BertTokenizer.from_pretrained('bert-base-cased')
     STOP_WORDS = list(get_stop_words(language=DEFAULT_LANGUAGE)) + list(string.punctuation)
 
     def __init__(self, pipeline: Pipeline):
@@ -40,7 +40,7 @@ class Summarizer:
         cumulative_token_length = 0
         for sentence in summary_sentences:
             result_list.append(sentence)
-            token_list =
+            token_list = nltk.word_tokenize(sentence)
             token_words = [token for token in token_list if token.lower() not in Summarizer.STOP_WORDS]
             token_length = len(token_words)
             if token_length + cumulative_token_length >= max_token_length:
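For context, here is a minimal sketch of the token-counting loop as it stands after this commit. The loop body and the stop-word filtering follow the diff above; the truncate_sentences wrapper, its signature, and the break once the budget is reached are assumptions, since the diff cuts off before the body of the if. Note that nltk.word_tokenize requires NLTK's punkt tokenizer data (punkt_tab on recent NLTK releases) to be available at runtime.

```python
# Minimal sketch, assuming a truncate_sentences() wrapper and a `break` once the
# token budget is reached; the diff does not show the body of the `if`.
import string

import nltk
from sumy.utils import get_stop_words

# word_tokenize needs the punkt models; recent NLTK releases fetch "punkt_tab".
nltk.download("punkt", quiet=True)

STOP_WORDS = list(get_stop_words(language="english")) + list(string.punctuation)


def truncate_sentences(summary_sentences, max_token_length):
    """Collect sentences until the stop-word-filtered token count reaches the cap."""
    result_list = []
    cumulative_token_length = 0
    for sentence in summary_sentences:
        # The sentence is appended before the length check, so the sentence that
        # crosses the cap is still included, mirroring the diff above.
        result_list.append(sentence)
        token_list = nltk.word_tokenize(sentence)  # the tokenizer this commit restores
        token_words = [token for token in token_list if token.lower() not in STOP_WORDS]
        token_length = len(token_words)
        if token_length + cumulative_token_length >= max_token_length:
            break  # assumed behavior once the budget is exhausted
        cumulative_token_length += token_length
    return result_list
```

Since word_tokenize counts whole words rather than BERT word pieces, the same max_token_length will typically admit somewhat more text than the subword tokenizer this commit removes.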