Code:
(gdb) run handhelditems 1 1000
Starting program: /home/wqq/crawl/handhelditems/crawl_what_i_want handhelditems 1 1000
[Thread debugging using libthread_db enabled]
[New Thread -1208232256 (LWP 3796)]
[New Thread -1210332272 (LWP 3799)]
[New Thread -1220822128 (LWP 3800)]
[New Thread -1231311984 (LWP 3801)]
[New Thread -1243206768 (LWP 3802)]
[New Thread -1253696624 (LWP 3803)]
[New Thread -1264186480 (LWP 3804)]
[New Thread -1274676336 (LWP 3805)]
[New Thread -1285166192 (LWP 3806)]
[New Thread -1295656048 (LWP 3807)]
[New Thread -1306145904 (LWP 3808)]
[New Thread -1316635760 (LWP 3809)]
Program received signal SIGPWR, Power fail/restart.
[Switching to Thread -1316635760 (LWP 3809)]
0x00196402 in __kernel_vsyscall ()
(gdb)
-------------------------------------------
The source of crawl_what_i_want.java is:
-------------------------------------------
/**
 * Command-line launcher: parses the site name and the id range, then creates
 * and starts ten {@code thread} workers numbered 0 through 9.
 */
public class crawl_what_i_want {
    /**
     * Expects exactly three arguments: site name, first id, last id.
     * With any other argument count only the usage line is printed.
     *
     * @param args command-line arguments
     * @throws NumberFormatException if the second or third argument is not an integer
     */
    public static void main(String[] args) {
        // Guard clause: wrong argument count means print the usage and do nothing else.
        if (args.length != 3) {
            System.out.println("Usage: java crawl_html [site name] [start at] [end at]");
            return;
        }

        String siteName = args[0];
        int firstId = Integer.parseInt(args[1]);
        int lastId = Integer.parseInt(args[2]);

        // Build every worker first, then start them in slot order.
        thread[] workers = new thread[10];
        for (int slot = 0; slot < workers.length; slot++) {
            workers[slot] = new thread(siteName, firstId, lastId, slot);
        }
        for (int slot = 0; slot < workers.length; slot++) {
            workers[slot].start();
        }
    }
}
---------------------------------------------
The source of thread.java is:
---------------------------------------------
import java.io.*;
import java.sql.*;
import java.util.*;
import java.net.*;
/**
 * Crawl worker. Loads the (url, url_id) rows whose id lies in
 * [s_at, e_at] from the "urls" table of database "url_set_" + site_n, then
 * downloads every STRIDE-th row starting at index for_id. Each page is saved
 * as "./" + (url_id / 10000) + "/" + url_id + ".html"; a product image found
 * between the "product image(s) bof" / "product image(s) eof" markers is
 * saved next to it as url_id + ".jpg".
 *
 * The code deliberately avoids generics and try-with-resources so that it
 * keeps compiling at the language level the rest of the file is written in.
 */
public class thread extends Thread
{
    /**
     * Distance between two rows handled by the same worker. The launcher
     * starts workers with for_id 0..9, so this has to stay at 10.
     */
    private static final int STRIDE = 10;

    String site_n;
    int s_at;
    int e_at;
    int for_id;

    /**
     * @param site_n site name; used in the database name and in image URLs
     * @param s_at   lowest url_id to load (inclusive)
     * @param e_at   highest url_id to load (inclusive)
     * @param for_id index of the first row this worker handles
     */
    thread(String site_n, int s_at, int e_at, int for_id) {
        this.site_n = site_n;
        this.s_at = s_at;
        this.e_at = e_at;
        this.for_id = for_id;
    }

    /**
     * Loads the URL list and crawls this worker's share of it.
     *
     * Not synchronized: every worker is its own object, so a lock on "this"
     * protects nothing, and holding the Thread object's monitor for the whole
     * run gets in the way of Thread.join(timeout), which needs that monitor.
     *
     * A failure on one URL is reported and the loop moves on to the next one
     * instead of ending the worker.
     */
    public void run() {
        ArrayList url_list = new ArrayList();
        ArrayList url_id_list = new ArrayList();
        try {
            loadUrls(url_list, url_id_list);
        }
        catch (Exception e) {
            // Best effort, as before: crawl whatever rows were read before the failure.
            e.printStackTrace();
        }

        for (int i = for_id; i < url_list.size(); i = i + STRIDE) {
            String url = null;
            try {
                url = url_list.get(i).toString();
                String urlId = url_id_list.get(i).toString();
                crawlPage(i, url, urlId);
            }
            catch (Exception e) {
                System.out.println((i + s_at) + ". failed at " + url);
                e.printStackTrace();
            }
        }
    }

    /**
     * Reads the url and url_id columns for ids in [s_at, e_at] into the two
     * lists (same index = same row). Result set, statement and connection are
     * closed on every path.
     */
    private void loadUrls(ArrayList urls, ArrayList urlIds) throws Exception {
        Class.forName("com.mysql.jdbc.Driver").newInstance();
        String getconn = "jdbc:mysql://localhost/url_set_" + site_n + "?user=xxxx&password=xxxx";
        Connection conn = DriverManager.getConnection(getconn);
        try {
            // Every worker runs this query on its own and then picks rows by
            // index, so all workers must see the rows in the same order.
            PreparedStatement stmt = conn.prepareStatement(
                "select url,url_id from urls where url_id>=? and url_id<=? order by url_id");
            try {
                stmt.setInt(1, s_at);
                stmt.setInt(2, e_at);
                ResultSet rs = stmt.executeQuery();
                try {
                    while (rs.next()) {
                        urlIds.add(rs.getString(2));
                        urls.add(rs.getString(1));
                    }
                }
                finally {
                    rs.close();
                }
            }
            finally {
                stmt.close();
            }
        }
        finally {
            conn.close();
        }
    }

    /**
     * Downloads one page, fetches its product image if one is found, and
     * writes the page text to disk.
     *
     * @param index position of the row in the loaded list (used in messages only)
     * @param url   page address
     * @param urlId row id; determines the directory and the file names
     */
    private void crawlPage(int index, String url, String urlId) throws Exception {
        String save_dir = "./" + String.valueOf(Integer.parseInt(urlId) / 10000) + "/";
        try {
            File dir = new File(save_dir);
            // Another worker may create the directory between the check and
            // mkdir(); mkdir() then just returns false, which is fine.
            if (!dir.isDirectory())
                dir.mkdir();
        }
        catch (Exception exp) {
            exp.printStackTrace();
        }

        // Buffer instead of String += so long pages do not cost quadratic time.
        StringBuffer html = new StringBuffer();
        BufferedReader br = new BufferedReader(
            new InputStreamReader(new URL(url).openConnection().getInputStream()));
        try {
            boolean inImages = false;
            String line = br.readLine();
            while (line != null) {
                html.append(line).append((char) 13);

                if (line.indexOf("product image(s) bof") >= 0)
                    inImages = true;
                if (line.indexOf("product image(s) eof") >= 0)
                    inImages = false;

                if (inImages && line.indexOf("img src") >= 0) {
                    // NOTE(review): this marker is an absolute URL that gets appended
                    // to "http://www.<site>.com/" below; confirm it is the intended
                    // marker and not a mangled relative path.
                    int bofimg = line.indexOf("https://www.unix.com/images/");
                    // Look for the end of the file name only after the marker, and
                    // only if the marker is present: substring(-1, n) would throw.
                    int jpgAt = (bofimg < 0) ? -1 : line.indexOf("jpg\"", bofimg);
                    if (jpgAt >= 0) {
                        String imgURL = "http://www." + site_n + ".com/" + line.substring(bofimg, jpgAt + 3);
                        String imgdir = save_dir + urlId + ".jpg";
                        try {
                            new getpic().crawlpic(imgURL, imgdir);
                        }
                        catch (Exception e) {
                            // A missing picture should not cost us the page itself.
                            e.printStackTrace();
                        }
                    }
                }
                line = br.readLine();
            }
        }
        finally {
            // Closing the outermost reader also closes the wrapped streams.
            br.close();
        }

        if (html.length() > 0) {
            String saveTo = save_dir + urlId + ".html";
            try {
                new outPut(html.toString(), saveTo);
                System.out.println((index + s_at) + ". Saved " + url + " as " + (index + s_at) + ".html");
            }
            catch (IOException e) {
                e.printStackTrace();
            }
        }
        else
            System.out.println((index + s_at) + ". failed at " + url);
    }
}
------------------------------------------
The source of outPut.java is:
------------------------------------------
import java.io.*;
/**
 * One-shot file writer: constructing an instance writes the given text to the
 * given path, replacing any previous content of that file.
 */
public class outPut {
    /**
     * Writes {@code content} to {@code outPutFile} using the platform default
     * character encoding. The file is created if it does not exist.
     *
     * @param content    text to write
     * @param outPutFile path of the file to write
     * @throws IOException if the file cannot be opened, written or closed
     */
    public outPut(String content, String outPutFile) throws IOException {
        // FileWriter creates a missing file by itself, so no exists()/createNewFile()
        // step is needed (and none can race with another writer).
        BufferedWriter bw = new BufferedWriter(new FileWriter(outPutFile));
        try {
            bw.write(content);
        }
        finally {
            // close() flushes first and also closes the underlying FileWriter.
            // A single call means a failing flush can no longer skip the close.
            bw.close();
        }
    }
}
----------------------------
The source of getpic.java is:
----------------------------
import java.io.*;
import java.net.*;
/**
 * Downloads the resource behind a URL into a local file.
 */
public class getpic {
    /**
     * Copies everything readable from {@code url} into the file {@code savedir}.
     * The connection is opened before the file, so a URL that cannot be opened
     * does not leave an empty file behind.
     *
     * The method keeps no state, so it does not need to be synchronized.
     *
     * @param url     address of the resource to fetch
     * @param savedir path of the destination file (a file path despite the name);
     *                an existing file is overwritten
     * @throws Exception if the URL is malformed or reading/writing fails
     */
    public void crawlpic(String url, String savedir) throws Exception {
        InputStream in = new URL(url).openConnection().getInputStream();
        try {
            OutputStream out = new FileOutputStream(savedir);
            try {
                // Copy in chunks; writing single bytes to an unbuffered
                // FileOutputStream costs one write call per byte.
                byte[] chunk = new byte[8192];
                int count = in.read(chunk);
                while (count != -1) {
                    out.write(chunk, 0, count);
                    count = in.read(chunk);
                }
            }
            finally {
                out.close();
            }
        }
        finally {
            // Runs even when closing the output fails.
            in.close();
        }
    }
}