sdhanabal1 committed on
Commit
4372d93
·
1 Parent(s): c98407b

Update min length to 24 tokens

Browse files
Files changed (3) hide show
  1. Summarizer.py +6 -9
  2. app.py +1 -1
  3. test_summarizer.py +3 -3
Summarizer.py CHANGED
@@ -1,4 +1,3 @@
1
- import string
2
  import nltk
3
 
4
  from sumy.parsers import DocumentParser
@@ -13,8 +12,7 @@ from transformers import Pipeline
13
 
14
  class Summarizer:
15
  DEFAULT_LANGUAGE = "english"
16
- DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
17
- STOP_WORDS = list(get_stop_words(language=DEFAULT_LANGUAGE)) + list(string.punctuation)
18
 
19
  def __init__(self, pipeline: Pipeline):
20
  self.pipeline = pipeline
@@ -34,16 +32,15 @@ class Summarizer:
34
  return " ".join([sentence for sentence in summary_sentences])
35
 
36
  @staticmethod
37
- def split_sentences_by_token_length(summary_sentences: list, max_token_length: int) -> list:
38
  accumulated_lists = []
39
  result_list = []
40
  cumulative_token_length = 0
41
  for sentence in summary_sentences:
42
  result_list.append(sentence)
43
  token_list = nltk.word_tokenize(sentence)
44
- token_words = [token for token in token_list if token.lower() not in Summarizer.STOP_WORDS]
45
- token_length = len(token_words)
46
- if token_length + cumulative_token_length >= max_token_length:
47
  accumulated_lists.append(Summarizer.join_sentences(result_list))
48
  result_list = []
49
  cumulative_token_length = 0
@@ -72,9 +69,9 @@ class Summarizer:
72
  :return: List of abstractive summary of sentences after calling distilbart-tos-summarizer-tosdr tokenizer
73
  """
74
  wrapped_sentences = Summarizer.split_sentences_by_token_length(extract_summary_sentences,
75
- max_token_length=1000)
76
  # The ml6team/distilbart-tos-summarizer-tosdr tokenizer supports a max of 1024 tokens per input
77
  abstractive_summary_list = []
78
- for result in self.pipeline(wrapped_sentences, min_length=5, max_length=512):
79
  abstractive_summary_list.append(result['summary_text'])
80
  return abstractive_summary_list
 
 
1
  import nltk
2
 
3
  from sumy.parsers import DocumentParser
 
12
 
13
  class Summarizer:
14
  DEFAULT_LANGUAGE = "english"
15
+ DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 15
 
16
 
17
  def __init__(self, pipeline: Pipeline):
18
  self.pipeline = pipeline
 
32
  return " ".join([sentence for sentence in summary_sentences])
33
 
34
  @staticmethod
35
+ def split_sentences_by_token_length(summary_sentences: list, split_token_length: int) -> list:
36
  accumulated_lists = []
37
  result_list = []
38
  cumulative_token_length = 0
39
  for sentence in summary_sentences:
40
  result_list.append(sentence)
41
  token_list = nltk.word_tokenize(sentence)
42
+ token_length = len(token_list)
43
+ if token_length + cumulative_token_length >= split_token_length:
 
44
  accumulated_lists.append(Summarizer.join_sentences(result_list))
45
  result_list = []
46
  cumulative_token_length = 0
 
69
  :return: List of abstractive summary of sentences after calling distilbart-tos-summarizer-tosdr tokenizer
70
  """
71
  wrapped_sentences = Summarizer.split_sentences_by_token_length(extract_summary_sentences,
72
+ split_token_length=512)
73
  # The ml6team/distilbart-tos-summarizer-tosdr tokenizer supports a max of 1024 tokens per input
74
  abstractive_summary_list = []
75
+ for result in self.pipeline(wrapped_sentences, min_length=24, max_length=512):
76
  abstractive_summary_list.append(result['summary_text'])
77
  return abstractive_summary_list
app.py CHANGED
@@ -19,7 +19,7 @@ def main() -> None:
19
  "Well don't worry, neither do we! That's why we created a <b>Terms & Conditions Summarization</b> algorithm!", unsafe_allow_html=True)
20
  st.markdown('Just copy-paste that pesky Terms & Conditions text or provide a URL to the text and let our fancy NLP algorithm do the rest!<br>'
21
  'You will see both an extractive summary (the most important sentences will be highlighted) and an abstractive summary (an actual summary)<br>'
22
- 'Now you can just take a quick glanse at the summary and go about the rest of your day assured that no one is abusing your precious personal data :books:', unsafe_allow_html=True)
23
  st.markdown('<b>Want to find out more?</b> :brain:<br>'
24
  'For details about the extractive part :point_right: https://en.wikipedia.org/wiki/Latent_semantic_analysis<br>'
25
  'For details about the abstractive part :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr', unsafe_allow_html=True)
 
19
  "Well don't worry, neither do we! That's why we created a <b>Terms & Conditions Summarization</b> algorithm!", unsafe_allow_html=True)
20
  st.markdown('Just copy-paste that pesky Terms & Conditions text or provide a URL to the text and let our fancy NLP algorithm do the rest!<br>'
21
  'You will see both an extractive summary (the most important sentences will be highlighted) and an abstractive summary (an actual summary)<br>'
22
+ 'The abstractive summary will give you an idea of what the key message of the document likely is :bulb:', unsafe_allow_html=True)
23
  st.markdown('<b>Want to find out more?</b> :brain:<br>'
24
  'For details about the extractive part :point_right: https://en.wikipedia.org/wiki/Latent_semantic_analysis<br>'
25
  'For details about the abstractive part :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr', unsafe_allow_html=True)
test_summarizer.py CHANGED
@@ -8,19 +8,19 @@ def test_split_sentences_by_token_length():
8
  'Free.'
9
  ]
10
 
11
- split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=3)
12
  assert split_sentences == [
13
  'Python is a programming language.',
14
  'Memory allocation. Free.'
15
  ]
16
 
17
- split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=5)
18
  assert split_sentences == [
19
  'Python is a programming language. Memory allocation.',
20
  'Free.'
21
  ]
22
 
23
- split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=10)
24
  assert split_sentences == [
25
  'Python is a programming language. Memory allocation. Free.'
26
  ]
 
8
  'Free.'
9
  ]
10
 
11
+ split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, split_token_length=5)
12
  assert split_sentences == [
13
  'Python is a programming language.',
14
  'Memory allocation. Free.'
15
  ]
16
 
17
+ split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, split_token_length=7)
18
  assert split_sentences == [
19
  'Python is a programming language. Memory allocation.',
20
  'Free.'
21
  ]
22
 
23
+ split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, split_token_length=10)
24
  assert split_sentences == [
25
  'Python is a programming language. Memory allocation. Free.'
26
  ]