forked from mark-watson/Java-AI-Book-Code
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathInMemorySearch.java
More file actions
executable file
·138 lines (129 loc) · 4.22 KB
/
InMemorySearch.java
File metadata and controls
executable file
·138 lines (129 loc) · 4.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
package textsearch;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.TreeSet;
public class InMemorySearch {
private final int MAX_WORDS_PER_DOCUMENT = 1000;
public static void main(String[] args) {
InMemorySearch ims = new InMemorySearch();
TestDocument doc1 = new TestDocument("This is a test for index and a test for search.");
ims.add(doc1);
TestDocument doc2 = new TestDocument("Please test the index code.");
ims.add(doc2);
TestDocument doc3 = new TestDocument("Please test the index code before tomorrow.");
ims.add(doc3);
ims.debug();
List<Integer> search_results = ims.search("test index", 2);
System.out.println("\nSearch results (doc IDs): " + search_results);
}
public void add(TestDocument document) {
Map<String,Integer> wcount = new Hashtable<String,Integer>();
StringTokenizer st = new StringTokenizer(document.text.toLowerCase(), " .,;:!");
int num_words = st.countTokens();
if (num_words == 0) return;
while (st.hasMoreTokens()) {
String word = st.nextToken();
System.out.println(word);
if (wcount.containsKey(word)) {
wcount.put(word, wcount.get(word) + (MAX_WORDS_PER_DOCUMENT / num_words));
} else {
wcount.put(word, MAX_WORDS_PER_DOCUMENT / num_words);
}
}
for (String word : wcount.keySet()) {
TreeSet<IdCount> ts;
if (index.containsKey(word)) {
ts = index.get(word);
} else {
ts = new TreeSet<IdCount>();
index.put(word, ts);
}
ts.add(new IdCount(document.id, wcount.get(word) * MAX_WORDS_PER_DOCUMENT / num_words));
}
}
public List<Integer> search(String search_terms) {
return search(search_terms, 10);
}
public List<Integer> search(String search_terms, int max_terms) {
List<Integer> ret = new ArrayList<Integer>(10);
// temporary tree set to keep ordered search results:
final Map<Integer,Integer> ordered_results = new Hashtable<Integer,Integer>(0);
StringTokenizer st = new StringTokenizer(search_terms.toLowerCase(), " .,;:!");
while (st.hasMoreTokens()) {
String word = st.nextToken();
Iterator<IdCount> word_counts = index.get(word).iterator();
while (word_counts.hasNext()) {
IdCount ts = word_counts.next();
Integer id = ts.id;
if (ordered_results.containsKey(id)) {
ordered_results.put(id, ordered_results.get(id) + ts.count);
} else {
ordered_results.put(id, ts.count);
}
}
}
List<Integer> keys = new ArrayList<Integer>(ordered_results.keySet());
Collections.sort(keys, new Comparator<Integer>() {
public int compare(Integer a, Integer b) {
return -ordered_results.get(a).compareTo(ordered_results.get(b)) ;
}
});
int count = 0;
result_loop:
for (Integer id : keys) {
if (count++ >= max_terms) break result_loop;
ret.add(id);
}
return ret;
}
public void debug() {
System.out.println("*** Debug: dump of search index:\n");
for (String word : index.keySet()) {
System.out.println("\n* " + word);
TreeSet<IdCount> ts = index.get(word);
Iterator<IdCount> iter = ts.iterator();
while (iter.hasNext()) {
System.out.println(" " + iter.next());
}
}
}
private Map<String,TreeSet<IdCount>> index =
new Hashtable<String, TreeSet<IdCount>>();
}
class TestDocument {
int id;
String text;
static int count = 0;
TestDocument(String text) {
this.text = text;
id = count++;
}
public String toString() {
int len = text.length();
if (len > 25) len = 25;
return "[Document id: " + id + ": " + text.substring(0,len) + "...]";
}
}
class IdCount implements Comparable<IdCount> {
int id = 0;
int count = 0;
public IdCount(int k, int v) {
this.id = k;
this.count = v;
}
public String toString() {
return "[IdCount: " + id + " : " + count + "]";
}
public int compareTo(IdCount o) {
// don't use i1-i2: avoid overflows
if (o.count == count) return 0;
if (o.count > count) return 1;
return -1;
}
}