2010年10月19日 星期二

net.htmlparser.jericho

下載 jericho-html-3.1.jar

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;


/**
 * Downloads the listing page and collects the distinct {@code hello.asp?id=N}
 * links that appear in its markup.
 *
 * <p>Every regex hit is echoed to standard output as it is found. The follow-up
 * call that would fetch each detail page is currently disabled (commented out),
 * so the collected links are not processed any further.
 *
 * @throws IOException if the page cannot be fetched
 */
public static void getWebPage() throws IOException, ClassNotFoundException,
        SQLException, IllegalArgumentException, FeedException, InterruptedException {
    String host = "http://www.xxx.edu.tw/lp.asp?mp=104021/";
    Source source = www.getWebPage(host);
    String html = source.toString();

    // Extract the detail-page links. The dot is escaped so that only a literal
    // "hello.asp" matches (an unescaped '.' would accept any character there).
    Pattern pattern = Pattern.compile("hello\\.asp\\?id=\\d+");
    Matcher matcher = pattern.matcher(html);
    Set<String> links = new HashSet<String>();
    while (matcher.find()) {
        String link = matcher.group();
        System.out.println("比對符合 " + link);
        links.add(link);
    }

    // The set has already dropped duplicate links; visit each unique one once.
    for (String link : links) {
        // Disabled for now: fetch and print the detail page behind each link.
        // getDetail("http://www.xxx.edu.tw/" + link);
    }

}


/**
 * Fetches the page at {@code host} and prints the content of every
 * {@code <div>} element whose {@code class} attribute is exactly {@code "cp"}.
 *
 * @param host absolute URL of the page to fetch
 * @throws IOException if the page cannot be fetched
 */
public static void getDetail(String host) throws IOException,
        ClassNotFoundException, SQLException, IllegalArgumentException,
        FeedException, InterruptedException {
    Source source = www.getWebPage(host);
    // Typed list: with a raw List the enhanced for loop below would yield
    // Object elements and fail to compile.
    List<Element> divs = source.getAllElements(HTMLElementName.DIV);
    for (Element div : divs) {
        // Constant-first equals also covers a missing class attribute (null).
        if ("cp".equals(div.getAttributeValue("class"))) {
            System.out.println("cp " + div.getContent().toString());
            // Alternative extractions kept for reference:
            // System.out.println("cp " + div.getRenderer().toString());
            // System.out.println("h2 " + div.getAllElements(HTMLElementName.H2).get(0).getRenderer());
            // System.out.println("li " + div.getAllElements(HTMLElementName.LI).get(2).getRenderer());
        }
    }
}
//----------------------------

/**
 * Opens {@code host} and wraps the response in a Jericho {@link Source}.
 *
 * <p>When the local machine's address starts with {@code 10.10.} the request is
 * sent through the HTTP proxy at 192.16.1.5:3000; otherwise the URL is opened
 * directly.
 *
 * @param host absolute URL of the page to fetch
 * @return the parsed source of the fetched page
 * @throws IOException if the URL is malformed, the local address cannot be
 *         resolved, or the page cannot be read
 */
public static Source getWebPage(String host) throws IOException, ClassNotFoundException,
        SQLException, IllegalArgumentException, FeedException {
    URL srcUrl = new URL(host);
    String addr = InetAddress.getLocalHost().getHostAddress();

    // Plain prefix test instead of String.matches("10.10.*"): as a regex the
    // dots match any character, so addresses such as 10.100.x.x were also
    // treated as being on the proxied network.
    // NOTE(review): assumes the 10.10.x.x range was the intent - confirm.
    if (addr.startsWith("10.10.")) {
        InetSocketAddress proxyAddress = new InetSocketAddress("192.16.1.5", 3000);
        Proxy proxy = new Proxy(Proxy.Type.HTTP, proxyAddress);
        return new Source(srcUrl.openConnection(proxy));
    }
    return new Source(srcUrl);
}

沒有留言: