LittleSearchEngine.java
package lse;
import java.io.*;
import java.util.*;
/**
* This class builds an index of keywords. Each keyword maps to a set of pages in
* which it occurs, with frequency of occurrence in each page.
*
*/
public class LittleSearchEngine {
    /**
     * This is a hash table of all keywords. The key is the actual keyword, and the associated value is
     * an array list of all occurrences of the keyword in documents. The array list is maintained in
     * DESCENDING order of frequencies.
     */
    HashMap<String,ArrayList<Occurrence>> keywordsIndex;

    /**
     * The hash set of all noise words.
     */
    HashSet<String> noiseWords;

    /**
     * Creates the keywordsIndex and noiseWords hash tables.
     */
    public LittleSearchEngine() {
        keywordsIndex = new HashMap<String,ArrayList<Occurrence>>(1000,2.0f);
        noiseWords = new HashSet<String>(100,2.0f);
    }
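    /*
     * A minimal sketch of how the finished index is meant to be read once makeIndex
     * has run (the keyword "deep" and the documents/counts below are made up for
     * illustration only):
     *
     *   ArrayList<Occurrence> occs = keywordsIndex.get("deep");
     *   // occs.get(0) is the document where "deep" occurs most often, e.g.
     *   // (AliceCh1.txt, 7), followed by (WowCh1.txt, 3) -- descending frequency.
     */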
    /**
     * Scans a document, and loads all keywords found into a hash table of keyword occurrences
     * in the document. Uses the getKeyword method to separate keywords from other words.
     *
     * @param docFile Name of the document file to be scanned and loaded
     * @return Hash table of keywords in the given document, each associated with an Occurrence object
     * @throws FileNotFoundException If the document file is not found on disk
     */
    public HashMap<String,Occurrence> loadKeywordsFromDocument(String docFile)
    throws FileNotFoundException {
        if (!(new File(docFile).exists())) {
            throw new FileNotFoundException();
        }
        HashMap<String,Occurrence> re = new HashMap<String,Occurrence>();
        Scanner scan = new Scanner(new File(docFile));
        while (scan.hasNextLine()) {
            String line = scan.nextLine();
            StringTokenizer tok = new StringTokenizer(line, " ");
            while (tok.hasMoreTokens()) {
                String keyWord = getKeyword(tok.nextToken());
                if (keyWord != null) {
                    if (re.containsKey(keyWord)) {
                        re.get(keyWord).frequency += 1;
                    } else {
                        re.put(keyWord, new Occurrence(docFile, 1));
                    }
                }
            }
        }
        scan.close();
        return re;
    }
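    /*
     * Usage sketch for loadKeywordsFromDocument, assuming a plain-text file named
     * "AliceCh1.txt" exists in the working directory (the file name is hypothetical).
     * Note that noise words are normally loaded first by makeIndex, so a standalone
     * call like this would not filter them out:
     *
     *   LittleSearchEngine lse = new LittleSearchEngine();
     *   HashMap<String,Occurrence> kws = lse.loadKeywordsFromDocument("AliceCh1.txt");
     *   // each entry maps a keyword to one Occurrence whose document is "AliceCh1.txt"
     *   // and whose frequency is the number of times the keyword appears in that file.
     */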
    /**
     * Merges the keywords for a single document into the master keywordsIndex
     * hash table. For each keyword, its Occurrence in the current document
     * must be inserted in the correct place (according to descending order of
     * frequency) in the same keyword's Occurrence list in the master hash table.
     * This is done by calling the insertLastOccurrence method.
     *
     * @param kws Keywords hash table for a document
     */
    public void mergeKeywords(HashMap<String,Occurrence> kws) {
        for (String key : kws.keySet()) {
            if (keywordsIndex.get(key) == null) {
                ArrayList<Occurrence> put = new ArrayList<Occurrence>();
                put.add(kws.get(key));
                keywordsIndex.put(key, put);
            } else {
                keywordsIndex.get(key).add(kws.get(key));
            }
            insertLastOccurrence(keywordsIndex.get(key));
        }
    }
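    /*
     * Sketch of what mergeKeywords does for one keyword, with invented documents and
     * counts. Suppose keywordsIndex already holds
     *
     *   "alice" -> [ (AliceCh1.txt, 12), (AliceCh2.txt, 5) ]
     *
     * and the per-document table kws for "AliceCh3.txt" contains ("alice", 8). After
     * mergeKeywords(kws), insertLastOccurrence has slotted the new entry in place:
     *
     *   "alice" -> [ (AliceCh1.txt, 12), (AliceCh3.txt, 8), (AliceCh2.txt, 5) ]
     */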
    /**
     * Given a word, returns it as a keyword if it passes the keyword test: after any
     * trailing punctuation (. , ? : ; !) is stripped, the word must consist only of
     * alphabetic letters and must not be a noise word. The returned keyword is in all
     * lowercase; null is returned if the word is not a keyword.
     *
     * @param word Candidate word
     * @return Keyword (all lowercase), or null if the word is not a keyword
     */
    public String getKeyword(String word) {
        // strip trailing punctuation characters
        while (word.length() != 0 && ".,!?:;".indexOf(word.charAt(word.length()-1)) != -1) {
            word = word.substring(0, word.length()-1);
        }
        word = word.toLowerCase();
        if (word.length() == 0 || noiseWords.contains(word)) {
            return null;
        }
        if (word.matches("[a-z]*")) {
            return word;
        }
        return null;
    }
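    /*
     * Behavior sketch for getKeyword, assuming "the" has been loaded as a noise word
     * (the inputs are illustrative):
     *
     *   getKeyword("distance.")  ->  "distance"   // trailing '.' stripped
     *   getKeyword("Between")    ->  "between"    // lowercased
     *   getKeyword("the")        ->  null         // noise word
     *   getKeyword("word1")      ->  null         // contains a digit
     *   getKeyword("what's")     ->  null         // internal punctuation
     */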
    /**
     * Inserts the last occurrence in the parameter list in the correct position in the
     * list, based on ordering occurrences on descending frequencies. The elements
     * 0..n-2 in the list are already in the correct order. Insertion is done by
     * first finding the correct spot using binary search, then inserting at that spot.
     *
     * @param occs List of Occurrences
     * @return Sequence of mid point indexes in the input list checked by the binary search process,
     *         null if the size of the input list is 1. This returned array list is only used to test
     *         your code - it is not used elsewhere in the program.
     */
    public ArrayList<Integer> insertLastOccurrence(ArrayList<Occurrence> occs) {
        if (occs.size() == 1) {
            return null;
        }
        ArrayList<Integer> re = new ArrayList<Integer>();
        // pull the new occurrence off the end, then binary search the rest for its spot
        Occurrence holdOc = occs.get(occs.size()-1);
        occs.remove(occs.size()-1);
        int first = 0;
        int last = occs.size()-1;
        if (occs.size() == 1) {
            re.add(0);
            if (occs.get(0).frequency <= holdOc.frequency) {
                occs.add(0, holdOc);
            } else {
                occs.add(holdOc);
            }
            return re;
        }
        int middle;
        boolean go = true;
        while (go) {
            middle = ((last - first) / 2) + first;
            re.add(middle);
            if (first >= last) {
                // search interval has collapsed; insert just before or after this element
                if (occs.get(middle).frequency >= holdOc.frequency) {
                    occs.add(middle + 1, holdOc);
                } else {
                    occs.add(middle, holdOc);
                }
                break;
            }
            if (occs.get(middle).frequency == holdOc.frequency) {
                occs.add(middle, holdOc);
                break;
            } else if (occs.get(middle).frequency > holdOc.frequency) {
                first = middle + 1;
            } else {
                last = middle - 1;
            }
        }
        return re;
    }
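    /*
     * Worked example of insertLastOccurrence with invented frequencies. Suppose occs
     * holds occurrences whose frequencies are [12, 5, 3, 8], where 8 is the newly
     * appended occurrence to be placed. The binary search runs over [12, 5, 3]:
     *
     *   midpoints checked: index 1 (freq 5), then index 0 (freq 12) -> returns [1, 0]
     *   resulting frequency order: [12, 8, 5, 3]
     */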
    /**
     * This method indexes all keywords found in all the input documents. When this
     * method is done, the keywordsIndex hash table will be filled with all keywords,
     * each of which is associated with an array list of Occurrence objects, arranged
     * in decreasing frequencies of occurrence.
     *
     * @param docsFile Name of file that has a list of all the document file names, one name per line
     * @param noiseWordsFile Name of file that has a list of noise words, one noise word per line
     * @throws FileNotFoundException If there is a problem locating any of the input files on disk
     */
    public void makeIndex(String docsFile, String noiseWordsFile)
    throws FileNotFoundException {
        // load noise words to hash table for noiseWords
        Scanner sc = new Scanner(new File(noiseWordsFile));
        while (sc.hasNext()) {
            String word = sc.next();
            noiseWords.add(word);
        }
        sc.close();
        // index all keywords
        sc = new Scanner(new File(docsFile));
        while (sc.hasNext()) {
            String docFile = sc.next();
            // load the document's keywords, then merge them into the master index
            HashMap<String,Occurrence> kws = loadKeywordsFromDocument(docFile);
            mergeKeywords(kws);
        }
        sc.close();
    }
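    /*
     * Expected shape of the two input files, with hypothetical contents. docs.txt
     * lists one document file name per line, and noisewords.txt lists one noise word
     * per line:
     *
     *   docs.txt           noisewords.txt
     *   ---------          --------------
     *   AliceCh1.txt       a
     *   WowCh1.txt         about
     *   WowCh2.txt         the
     */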
    /**
     * Search result for "kw1 or kw2". A document is in the result set if kw1 or kw2 occurs in that
     * document. Result set is arranged in descending order of document frequencies. (Note that a
     * matching document will only appear once in the result.) Ties in frequency values are broken
     * in favor of the first keyword. (That is, if kw1 is in doc1 with frequency f1, and kw2 is in doc2
     * also with the same frequency f1, then doc1 will take precedence over doc2 in the result.)
     * The result set is limited to 5 entries. If there are no matches at all, result is null.
     *
     * @param kw1 First keyword
     * @param kw2 Second keyword
     * @return List of documents in which either kw1 or kw2 occurs, arranged in descending order of
     *         frequencies. The result size is limited to 5 documents. If there are no matches, returns null.
     */
    public ArrayList<String> top5search(String kw1, String kw2) {
        ArrayList<Occurrence> KW1 = new ArrayList<Occurrence>();
        ArrayList<Occurrence> KW2 = new ArrayList<Occurrence>();
        int i;
        ArrayList<String> re = new ArrayList<String>();
        // collect up to the top 5 occurrences of each keyword
        if (keywordsIndex.get(kw1) != null) {
            for (i = 0; i < keywordsIndex.get(kw1).size() && i < 5; i++) {
                KW1.add(keywordsIndex.get(kw1).get(i));
            }
        }
        if (keywordsIndex.get(kw2) != null) {
            for (i = 0; i < keywordsIndex.get(kw2).size() && i < 5; i++) {
                if (keywordsIndex.get(kw2).get(i) != null) {
                    KW2.add(keywordsIndex.get(kw2).get(i));
                }
            }
        }
        // merge the two lists in descending order of frequency, breaking ties in favor of kw1
        int Counter1 = 0;
        int Counter2 = 0;
        while (re.size() != 5) {
            if (Counter1 == KW1.size()) {
                for (int j = Counter2; re.size() != 5 && j < KW2.size(); j++) {
                    if (!(re.contains(KW2.get(j).document)))
                        re.add(KW2.get(j).document);
                }
                break;
            }
            if (Counter2 == KW2.size()) {
                for (int j = Counter1; re.size() != 5 && j < KW1.size(); j++) {
                    if (!(re.contains(KW1.get(j).document)))
                        re.add(KW1.get(j).document);
                }
                break;
            }
            if (KW1.get(Counter1).frequency > KW2.get(Counter2).frequency) {
                if (!(re.contains(KW1.get(Counter1).document)))
                    re.add(KW1.get(Counter1).document);
                Counter1++;
            } else if (KW1.get(Counter1).frequency < KW2.get(Counter2).frequency) {
                if (!(re.contains(KW2.get(Counter2).document)))
                    re.add(KW2.get(Counter2).document);
                Counter2++;
            } else {
                if (!(re.contains(KW1.get(Counter1).document)))
                    re.add(KW1.get(Counter1).document);
                Counter1++;
            }
        }
        // per the contract above, return null (not an empty list) when neither keyword matches
        if (re.isEmpty()) {
            return null;
        }
        return re;
    }
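    /*
     * Minimal end-to-end sketch of how the class is driven. This main method is not
     * part of the original assignment; the file names "docs.txt" and "noisewords.txt"
     * and the query keywords are assumptions made for illustration.
     */
    public static void main(String[] args) throws FileNotFoundException {
        LittleSearchEngine lse = new LittleSearchEngine();
        // build the index: load noise words, then index every document listed in docs.txt
        lse.makeIndex("docs.txt", "noisewords.txt");
        // query for documents containing either keyword, ranked by frequency (at most 5)
        ArrayList<String> results = lse.top5search("deep", "world");
        if (results == null) {
            System.out.println("no matching documents");
        } else {
            for (String doc : results) {
                System.out.println(doc);
            }
        }
    }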
}