pierreguillou
committed on
Commit
·
cf141c0
1
Parent(s):
afb4867
Update app.py
Browse files
app.py
CHANGED
@@ -23,13 +23,9 @@ kw_model = {
|
|
23 |
|
24 |
## KeyphraseVectorizers
|
25 |
# source: https://github.com/TimSchopf/KeyphraseVectorizers#keyphrasevectorizers
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
os.system("python -m spacy download pt_core_news_lg")
|
30 |
-
|
31 |
-
# Part-of-Speech Tagging for Portuguese (https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/Multilingual/Portuguese/03-POS-Keywords-Portuguese.html)
|
32 |
-
pos_pattern='<CONJ.*>*<ADP.*>*<ADV.*>*<NUM.*>*<ADJ.*>*<N.*>+'
|
33 |
|
34 |
# download stop words in Portuguese
|
35 |
#import nltk
|
@@ -37,8 +33,11 @@ pos_pattern='<CONJ.*>*<ADP.*>*<ADV.*>*<NUM.*>*<ADJ.*>*<N.*>+'
|
|
37 |
#from nltk.corpus import stopwords
|
38 |
#stop_words = set(stopwords.words('portuguese'))
|
39 |
|
|
|
|
|
|
|
40 |
# define o vectorizer
|
41 |
-
vectorizer = KeyphraseCountVectorizer(spacy_pipeline='pt_core_news_lg',
|
42 |
|
43 |
# function principal (keywords)
|
44 |
def get_kw_html(model_id, doc, top_n, diversity):
|
|
|
23 |
|
24 |
## KeyphraseVectorizers
|
25 |
# source: https://github.com/TimSchopf/KeyphraseVectorizers#keyphrasevectorizers
|
26 |
+
# download spacy pipeline (https://spacy.io/models/pt)
|
27 |
+
# source: https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/Multilingual/Portuguese/03-POS-Keywords-Portuguese.html
|
28 |
+
# os.system("python -m spacy download pt_core_news_lg")
|
|
|
|
|
|
|
|
|
29 |
|
30 |
# download stop words in Portuguese
|
31 |
#import nltk
|
|
|
33 |
#from nltk.corpus import stopwords
|
34 |
#stop_words = set(stopwords.words('portuguese'))
|
35 |
|
36 |
+
# Part-of-Speech Tagging for Portuguese (https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/Multilingual/Portuguese/03-POS-Keywords-Portuguese.html)
|
37 |
+
pos_pattern='<CONJ.*>*<ADP.*>*<ADV.*>*<NUM.*>*<ADJ.*>*<N.*>+'
|
38 |
+
|
39 |
# define o vectorizer
|
40 |
+
vectorizer = KeyphraseCountVectorizer(spacy_pipeline='pt_core_news_lg', stop_words=None, pos_pattern=pos_pattern, lowercase=False)
|
41 |
|
42 |
# function principal (keywords)
|
43 |
def get_kw_html(model_id, doc, top_n, diversity):
|