forked from mark-watson/Java-AI-Book-Code
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNutchClient.java
More file actions
executable file
·108 lines (100 loc) · 4.06 KB
/
NutchClient.java
File metadata and controls
executable file
·108 lines (100 loc) · 4.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import java.util.Scanner;
/**
* Nutch client
*
* <p/>
* Copyright 1996-2012 by Mark Watson. All rights reserved.
* <p/>
* This software can be used under either of the following licenses:
* <p/>
* 1. LGPL v3<br/>
* 2. Apache 2
* <p/>
*/
public class NutchClient {

    /**
     * Runs a query against an OpenSearch endpoint and, for every hit that
     * carries a <code>nutch:cache</code> element, also downloads the cached
     * page content.
     *
     * @param opensearch_url base URL of the OpenSearch service; the query is
     *                       appended as a <code>?query=...</code> parameter
     * @param query          search terms (URL-encoded as UTF-8 by this method)
     * @return one table per <code>item</code> element, with the keys
     *         <code>title</code>, <code>description</code>, <code>link</code>,
     *         <code>cache_uri</code> and <code>cache_content</code> when present
     * @throws IOException                  if the service or a cache URI cannot be read
     * @throws ParserConfigurationException if the XML parser cannot be configured
     * @throws SAXException                 if the response is not well-formed XML
     */
    public static List<Hashtable<String,String>> searchGetCache(String opensearch_url, String query)
            throws IOException, ParserConfigurationException, SAXException {
        return search_helper(opensearch_url, query, true);
    }

    /**
     * Runs a query against an OpenSearch endpoint without fetching cached pages.
     *
     * @param opensearch_url base URL of the OpenSearch service; the query is
     *                       appended as a <code>?query=...</code> parameter
     * @param query          search terms (URL-encoded as UTF-8 by this method)
     * @return one table per <code>item</code> element, with the keys
     *         <code>title</code>, <code>description</code>, <code>link</code>
     *         and <code>cache_uri</code> when present
     * @throws IOException                  if the service cannot be read
     * @throws ParserConfigurationException if the XML parser cannot be configured
     * @throws SAXException                 if the response is not well-formed XML
     */
    public static List<Hashtable<String,String>> search(String opensearch_url, String query)
            throws IOException, ParserConfigurationException, SAXException {
        return search_helper(opensearch_url, query, false);
    }

    /**
     * Shared implementation of {@link #search} and {@link #searchGetCache}:
     * builds the request URL, parses the XML response and converts every
     * <code>item</code> element into a table of strings.
     *
     * @param return_cache when true, the content behind each item's
     *                     <code>cache_uri</code> is downloaded and stored
     *                     under <code>cache_content</code>
     */
    private static List<Hashtable<String,String>> search_helper(String opensearch_url, String query, boolean return_cache)
            throws IOException, ParserConfigurationException, SAXException {
        List<Hashtable<String,String>> ret = new ArrayList<Hashtable<String,String>>();
        String url_str = opensearch_url + "?query=" + URLEncoder.encode(query, "UTF-8");
        System.out.println(url_str);

        Document doc = fetchDocument(url_str);
        doc.getDocumentElement().normalize();
        System.out.println( doc.getDocumentElement().getTagName());

        NodeList listItems = doc.getElementsByTagName("item");
        int numItems = listItems.getLength();
        for (int i = 0; i < numItems; i++) {
            Hashtable<String,String> new_item = parseItem(listItems.item(i));
            ret.add(new_item);
            String cache_uri = new_item.get("cache_uri");
            if (return_cache && cache_uri != null) {
                new_item.put("cache_content", getCacheContent(cache_uri));
            }
        }
        return ret;
    }

    /**
     * Downloads and parses the XML document at the given URL. The response
     * stream is always closed, whether parsing succeeds or fails.
     */
    private static Document fetchDocument(String url_str)
            throws IOException, ParserConfigurationException, SAXException {
        // Build the parser first so a configuration failure cannot leave an open connection behind.
        DocumentBuilder docBuilder = newDocumentBuilder();
        URLConnection uc = new URL(url_str).openConnection();
        BufferedInputStream bis = new BufferedInputStream(uc.getInputStream());
        try {
            return docBuilder.parse(bis);
        } finally {
            bis.close();
        }
    }

    /**
     * Creates a DOM parser that does not resolve external entities or load
     * external DTDs, because the parsed document comes from the network.
     * DOCTYPE declarations themselves are still accepted.
     */
    private static DocumentBuilder newDocumentBuilder() throws ParserConfigurationException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
        factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        return factory.newDocumentBuilder();
    }

    /**
     * Copies the interesting children of one <code>item</code> element into a
     * table: <code>title</code>, <code>description</code> and <code>link</code>
     * keep their element name as key, <code>nutch:cache</code> is stored
     * under <code>cache_uri</code>. All other child nodes are ignored.
     */
    private static Hashtable<String,String> parseItem(Node item) {
        Hashtable<String,String> new_item = new Hashtable<String,String>();
        NodeList item_data = item.getChildNodes();
        int num = item_data.getLength();
        for (int n = 0; n < num; n++) {
            Node data = item_data.item(n);
            String name = data.getNodeName();
            if (name.equals("title") || name.equals("description") || name.equals("link")) {
                new_item.put(name, data.getTextContent());
            } else if (name.equals("nutch:cache")) {
                new_item.put("cache_uri", data.getTextContent());
            }
        }
        return new_item;
    }

    /**
     * Reads the complete content behind a URI as text.
     *
     * <p>The bytes are decoded as UTF-8 rather than with the platform default
     * charset, so the result does not depend on the machine running the client.
     * As before, the <code>\Z</code> delimiter means a final line terminator
     * is not part of the returned string.
     *
     * @param cache_uri URI to read
     * @return the content, or an empty string when the resource is empty
     * @throws IOException if the URI is malformed or cannot be opened
     */
    public static String getCacheContent(String cache_uri) throws IOException {
        URL url = new URL(cache_uri);
        URLConnection uc = url.openConnection();
        Scanner scanner = new Scanner(uc.getInputStream(), "UTF-8");
        try {
            scanner.useDelimiter("\\Z");
            // An empty body has no token; next() would throw NoSuchElementException.
            return scanner.hasNext() ? scanner.next() : "";
        } finally {
            // Closing the scanner also closes the underlying connection stream.
            scanner.close();
        }
    }

    /**
     * Demo: queries a local Nutch OpenSearch service, prints all results and
     * the elapsed time, then prints up to the first two results individually.
     *
     * @param args ignored
     * @throws IOException                  if the service cannot be read
     * @throws ParserConfigurationException if the XML parser cannot be configured
     * @throws SAXException                 if the response is not well-formed XML
     */
    public static void main(String[] args) throws IOException, ParserConfigurationException, SAXException {
        long t1 = System.currentTimeMillis();
        List<Hashtable<String,String>> results =
                NutchClient.search("http://localhost:8080/opensearch", "Java AI");
                //NutchClient.searchGetCache("http://localhost:8080/opensearch", "Java RDF");
        long t2 = System.currentTimeMillis();
        System.out.println("results: " + results);
        System.out.println(" Time in milliseconds for web service call: " + (t2 - t1));
        // Guard against searches that return fewer than two hits.
        int shown = Math.min(2, results.size());
        for (int i = 0; i < shown; i++) {
            System.out.println(results.get(i));
        }
    }
}