Monday, November 15, 2010

Java crawler

package com.myjobalert.crawler;

import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

public abstract class Crawler {

    /**
     * Downloads the contents of the given URL and returns it as a String.
     * When httpParams is non-null the request is sent as a POST and each
     * name/value pair is added as a request header; otherwise a plain GET is used.
     */
    public String getURLContents(String urlStr, HttpParam[] httpParams) throws IOException {
        URL url = new URL(urlStr);
        URLConnection conn = url.openConnection();

        if (httpParams != null) {
            HttpURLConnection httpConn = (HttpURLConnection) conn;
            httpConn.setRequestMethod("POST");
            for (HttpParam param : httpParams) {
                httpConn.setRequestProperty(param.getName(), param.getValue());
            }
        }

        System.out.println("reported content length: " + conn.getContentLength());

        final int bufferSize = 5000;
        char[] buff = new char[bufferSize];
        StringBuilder sb = new StringBuilder(bufferSize);

        // Read the whole response into the StringBuilder, then close the reader
        // (assumes the page is UTF-8 encoded).
        InputStreamReader reader = new InputStreamReader(conn.getInputStream(), "UTF-8");
        try {
            int read;
            while ((read = reader.read(buff)) != -1) {
                sb.append(buff, 0, read);
            }
        } finally {
            reader.close();
        }

        System.out.println("downloaded " + sb.length() + " characters");
        System.out.println("page is downloaded");
        return sb.toString();
    }
}

/** Simple name/value holder for an HTTP request parameter/header. */
class HttpParam {

    private String name;
    private String value;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getValue() {
        return value;
    }

    public void setValue(String value) {
        this.value = value;
    }
}
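For reference, here is a minimal usage sketch: a hypothetical SimpleCrawler subclass that performs a plain GET (passing null for the parameters) and prints a short preview of the downloaded page. The subclass name and the URL are only examples, not part of the crawler itself.

package com.myjobalert.crawler;

// Hypothetical concrete subclass, shown only to illustrate how Crawler is called.
public class SimpleCrawler extends Crawler {

    public static void main(String[] args) throws Exception {
        Crawler crawler = new SimpleCrawler();

        // Passing null for httpParams skips the POST/header branch and does a plain GET.
        String page = crawler.getURLContents("http://www.example.com/", null);

        // Print the first few hundred characters of the page as a quick sanity check.
        System.out.println(page.substring(0, Math.min(200, page.length())));
    }
}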

