-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtextsummary.py
91 lines (65 loc) · 2.68 KB
/
textsummary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#importing libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
def _create_dictionary_table(text_string) -> dict:
    """Build a frequency table of the non-stop-word tokens in a text.

    The text is split with ``word_tokenize``; every token is lower-cased and
    counted unless it appears in NLTK's English stop-word list.

    Args:
        text_string: The text to analyse.

    Returns:
        A dict mapping each lower-cased token that is not an English stop
        word to the number of times it occurs in ``text_string``.
    """
    # A set keeps the per-token stop-word membership test cheap.
    english_stop_words = set(stopwords.words("english"))

    counts = {}
    for token in word_tokenize(text_string):
        key = token.lower()
        if key not in english_stop_words:
            counts[key] = counts.get(key, 0) + 1
    return counts
def _calculate_sentence_scores(sentences, frequency_table) -> dict:
#algorithm for scoring a sentence by its words
sentence_weight = dict()
for sentence in sentences:
for word in nltk.word_tokenize(sentence.lower()):
if word in frequency_table.keys():
if sentence in sentence_weight:
sentence_weight[sentence] += frequency_table[word]
else:
sentence_weight[sentence] = frequency_table[word]
return sentence_weight
def _calculate_average_score(sentence_weight) -> int:
#calculating the average score for the sentences
sum_values = 0
for entry in sentence_weight:
sum_values += sentence_weight[entry]
#getting sentence average value from source text
average_score = (sum_values / len(sentence_weight))
return average_score
def _get_article_summary(sentences, sentence_weight, threshold):
sentence_counter = 0
article_summary = ''
for sentence in sentences:
if sentence in sentence_weight and sentence_weight[sentence] >= (threshold):
article_summary += " " + sentence
sentence_counter += 1
return article_summary
def _run_article_summary(article, threshold_factor=1.2):
    """Produce an extractive summary of ``article``.

    Builds the word-frequency table, splits the article into sentences,
    scores every sentence, and keeps the sentences whose score is at least
    ``threshold_factor`` times the average sentence score.

    Args:
        article: The text to summarise.
        threshold_factor: Multiplier applied to the average sentence score to
            get the selection cut-off. Defaults to 1.2.

    Returns:
        The summary string built by ``_get_article_summary``, or ``''`` when
        no sentence received a score (for example, an empty article).
    """
    frequency_table = _create_dictionary_table(article)
    sentences = sent_tokenize(article)
    sentence_scores = _calculate_sentence_scores(sentences, frequency_table)

    # With nothing scored there is no average to compute and nothing to
    # select; return early instead of dividing by zero.
    if not sentence_scores:
        return ''

    average_score = _calculate_average_score(sentence_scores)
    return _get_article_summary(
        sentences, sentence_scores, threshold_factor * average_score
    )
def final_text_summary(text):
    """Summarise ``text`` and report word counts for both versions.

    Args:
        text: The text to summarise.

    Returns:
        A 4-tuple ``(text, summary, text_word_count, summary_word_count)``,
        where the counts are the number of whitespace-separated tokens.
    """
    summary = _run_article_summary(text)
    word_counts = [len(piece.split()) for piece in (text, summary)]
    return (text, summary, *word_counts)