熱點推薦:
您现在的位置: 電腦知識網 >> 編程 >> Java編程 >> Java核心技術 >> 正文

Java抓圖程序的實現

2013-11-23 18:49:19  來源: Java核心技術 

  主要難點:

  並發線程的控制:采用了JDK的 java.util.concurrent 包

  去重

  序列化

  運行方法:java -Xms<N>M -Xmx<N>M -jar JavaCrawler.jar <其余參數> C:/a.log D:/pic D:/url.tmp D:/img.tmp (注:堆大小數值及部分參數在轉載時丟失,請參照 SearchCrawler.main 對 args 的使用補全)

  SimpleBloomFilter.java

  view plaincopy to clipboardprint?

  ····················································································································package comhengkingcrawl;

  import javaioSerializable;

  import javautilBitSet;

  

  /**
   * A small Bloom filter over strings, used by the crawler to remember which URLs
   * have already been seen. Membership answers may be false positives but never
   * false negatives. The whole filter (bit set and hash functions) is serializable
   * so it can be snapshotted to disk and reloaded.
   *
   * <p>Usage:
   * <pre>
   *   SimpleBloomFilter filter = new SimpleBloomFilter();
   *   filter.contains(value);   // false
   *   filter.add(value);
   *   filter.contains(value);   // true
   * </pre>
   *
   * <p>Not synchronized; callers sharing one instance across threads must coordinate.
   */
  public class SimpleBloomFilter implements Serializable {

      // NOTE(review): the original literal was lost when this listing was extracted;
      // 1L is assumed. Confirm against any snapshot files that must stay readable.
      private static final long serialVersionUID = 1L;

      // NOTE(review): size and seeds were lost in extraction and are restored from the
      // widely published form of this filter (2 << 24 bits, seven small primes) - confirm.
      private final int DEFAULT_SIZE = 2 << 24;

      private final int[] seeds = new int[] { 5, 7, 11, 13, 31, 37, 61 };

      private BitSet bits = new BitSet(DEFAULT_SIZE);

      private SimpleHash[] func = new SimpleHash[seeds.length];

      /** Builds one hash function per seed, all mapping into the same bit range. */
      public SimpleBloomFilter() {
          for (int i = 0; i < seeds.length; i++) {
              func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
          }
      }

      /**
       * Records {@code value} in the filter.
       *
       * @param value string to remember; {@code null} is ignored, mirroring
       *              {@link #contains(String)} which reports {@code null} as absent
       */
      public void add(String value) {
          if (value == null) {
              return;
          }
          for (SimpleHash f : func) {
              bits.set(f.hash(value), true);
          }
      }

      /**
       * Tests whether {@code value} may have been added.
       *
       * @param value string to look up
       * @return {@code false} if the value was definitely never added (or is
       *         {@code null}); {@code true} if every hash position is set
       */
      public boolean contains(String value) {
          if (value == null) {
              return false;
          }
          boolean ret = true;
          for (SimpleHash f : func) {
              ret = ret && bits.get(f.hash(value));
          }
          return ret;
      }

      /** Seeded polynomial string hash reduced to a bit index below {@code cap}. */
      public class SimpleHash implements Serializable {

          private int cap;

          private int seed;

          /**
           * @param cap  size of the bit range; the mask in {@link #hash(String)}
           *             assumes a power of two
           * @param seed multiplier of the polynomial hash
           */
          public SimpleHash(int cap, int seed) {
              this.cap = cap;
              this.seed = seed;
          }

          /**
           * @param value non-null string to hash
           * @return index in {@code [0, cap)}
           */
          public int hash(String value) {
              int result = 0;
              int len = value.length();
              for (int i = 0; i < len; i++) {
                  // int overflow is intentional; the mask below keeps the index in range
                  result = seed * result + value.charAt(i);
              }
              return (cap - 1) & result;
          }

      }

      @Override
      public String toString() {
          return super.toString();
      }

  }

  package comhengkingcrawl;

  import javaioSerializable;

  import javautilBitSet;

  /**
   * A small Bloom filter over strings: {@link #add(String)} records a value and
   * {@link #contains(String)} answers "possibly present" or "definitely absent".
   * The filter is serializable so it can be written to disk and restored.
   *
   * <p>Not synchronized; callers sharing one instance across threads must coordinate.
   */
  public class SimpleBloomFilter implements Serializable {

      // NOTE(review): literal lost when this listing was extracted; 1L is assumed - confirm.
      private static final long serialVersionUID = 1L;

      // NOTE(review): size and seeds were lost in extraction; restored from the widely
      // published form of this filter (2 << 24 bits, seven small primes) - confirm.
      private final int DEFAULT_SIZE = 2 << 24;

      private final int[] seeds = new int[] { 5, 7, 11, 13, 31, 37, 61 };

      private BitSet bits = new BitSet(DEFAULT_SIZE);

      private SimpleHash[] func = new SimpleHash[seeds.length];

      /** Creates one hash function per seed over the same bit range. */
      public SimpleBloomFilter() {
          for (int i = 0; i < seeds.length; i++) {
              func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
          }
      }

      /**
       * Records {@code value}.
       *
       * @param value string to remember; {@code null} is ignored so that it cannot
       *              trigger a NullPointerException inside the hash functions
       */
      public void add(String value) {
          if (value == null) {
              return;
          }
          for (SimpleHash f : func) {
              bits.set(f.hash(value), true);
          }
      }

      /**
       * @param value string to look up
       * @return {@code true} if all hash positions for {@code value} are set;
       *         {@code false} for {@code null} or a value that was never added
       */
      public boolean contains(String value) {
          if (value == null) {
              return false;
          }
          boolean ret = true;
          for (SimpleHash f : func) {
              ret = ret && bits.get(f.hash(value));
          }
          return ret;
      }

      /** Seeded polynomial string hash reduced to a bit index below {@code cap}. */
      public class SimpleHash implements Serializable {

          private int cap;

          private int seed;

          /**
           * @param cap  size of the bit range; the mask in {@link #hash(String)}
           *             assumes a power of two
           * @param seed multiplier of the polynomial hash
           */
          public SimpleHash(int cap, int seed) {
              this.cap = cap;
              this.seed = seed;
          }

          /**
           * @param value non-null string to hash
           * @return index in {@code [0, cap)}
           */
          public int hash(String value) {
              int result = 0;
              int len = value.length();
              for (int i = 0; i < len; i++) {
                  // int overflow is intentional; the mask below keeps the index in range
                  result = seed * result + value.charAt(i);
              }
              return (cap - 1) & result;
          }

      }

      @Override
      public String toString() {
          return super.toString();
      }

  }

  UtilSeriz.java

  view plaincopy to clipboardprint?

  ····················································································································package comhengkingcrawl;

  import javaio*;

  /**
   * Helpers that save an object to a file with Java serialization and load it back.
   */
  public class UtilSeriz
  {
      /**
       * Serializes an object to a file on disk, replacing any existing file.
       *
       * @param o       object to write; must be serializable
       * @param strPath path of the target file
       * @throws Exception if the file cannot be opened or the object cannot be written
       */
      public static void writeObject(Object o, String strPath) throws Exception {
          File f = new File(strPath);
          // FileOutputStream truncates an existing file, so no separate delete is needed.
          FileOutputStream os = new FileOutputStream(f);
          try {
              ObjectOutputStream oos = new ObjectOutputStream(os);
              oos.writeObject(o);
              oos.flush();
          } finally {
              // Always release the file handle, even when serialization fails.
              os.close();
          }
      }

      /**
       * Deserializes the object stored in a file on disk.
       *
       * <p>NOTE(review): native Java deserialization must only be used on files this
       * program wrote itself; do not point it at untrusted input.
       *
       * @param strPath path of the file to read
       * @return the stored object, or {@code null} when the file does not exist
       * @throws Exception if the file cannot be read or does not hold a valid object
       */
      public static Object readObject(String strPath) throws Exception {
          File f = new File(strPath);
          if (!f.exists()) {
              return null;
          }
          InputStream is = new FileInputStream(f);
          try {
              ObjectInputStream ois = new ObjectInputStream(is);
              return ois.readObject();
          } finally {
              // The original never closed this stream, leaking a handle on every call.
              is.close();
          }
      }

  }

  package comhengkingcrawl;

  import javaio*;

  /**
   * Helpers that save an object to a file with Java serialization and load it back.
   */
  public class UtilSeriz
  {
      /**
       * Serializes an object to a file on disk, replacing any existing file.
       *
       * @param o       object to write; must be serializable
       * @param strPath path of the target file
       * @throws Exception if the file cannot be opened or the object cannot be written
       */
      public static void writeObject(Object o, String strPath) throws Exception {
          File f = new File(strPath);
          // FileOutputStream truncates an existing file, so no separate delete is needed.
          FileOutputStream os = new FileOutputStream(f);
          try {
              ObjectOutputStream oos = new ObjectOutputStream(os);
              oos.writeObject(o);
              oos.flush();
          } finally {
              // Always release the file handle, even when serialization fails.
              os.close();
          }
      }

      /**
       * Deserializes the object stored in a file on disk.
       *
       * <p>NOTE(review): only use on files this program wrote itself; native Java
       * deserialization is unsafe on untrusted input.
       *
       * @param strPath path of the file to read
       * @return the stored object, or {@code null} when the file does not exist
       * @throws Exception if the file cannot be read or does not hold a valid object
       */
      public static Object readObject(String strPath) throws Exception {
          File f = new File(strPath);
          if (!f.exists()) {
              return null;
          }
          InputStream is = new FileInputStream(f);
          try {
              ObjectInputStream ois = new ObjectInputStream(is);
              return ois.readObject();
          } finally {
              // The original never closed this stream, leaking a handle on every call.
              is.close();
          }
      }

  }

  SearchCrawler.java

  view plaincopy to clipboardprint?

  ····················································································································package comhengkingcrawl;

  import javaawtimageBufferedImage;

  import javaioBufferedInputStream;

  import javaioBufferedReader;

  import javaioBufferedWriter;

  import javaioFile;

  import javaioFileOutputStream;

  import javaioFileWriter;

  import javaioIOException;

  import javaioInputStreamReader;

  import URL;

  import javatextSimpleDateFormat;

  import javautilArrayList;

  import javautilCalendar;

  import javautilDate;

  import javautilHashMap;

  import javautilLinkedHashSet;

  import ncurrentCallable;

  import ncurrentExecutorService;

  import ncurrentExecutors;

  import ncurrentSemaphore;

  import javautilregexMatcher;

  import javautilregexPattern;

  import javaximageioImageIO;

  import comhengkingcrawlpoPoCalSearch;

  import comhengkingcrawlpoPoDownload;

  

  

  /***

   * 說明:抓圖工具

   * @author 君望永遠

   *

   */

  public class SearchCrawler implements Runnable{

  

      /* disallowListCache caches the URL paths that robots are not allowed to crawl.
       * Under the robots protocol a site places a robots.txt file in its web root that
       * lists the pages that are off limits; a crawler should skip those areas.
       * Example robots.txt (punctuation partly lost when this listing was extracted):

      # robots.txt for <site>

      User-agent: *

      Disallow: /cgi-bin/

      Disallow: /registration # Disallow robots on registration page

      Disallow: /login

      NOTE(review): throughout this class the extraction dropped '.', ',', quotes and
      numeric literals; the declarations below are reproduced as found and do not compile.
      */

      // Filters remembering page URLs / image URLs already seen (assigned in main).
      public static SimpleBloomFilter  filterUrl;

      public static  SimpleBloomFilter filterImg;

      private HashMap< StringArrayList< String>> disallowListCache = new HashMap< StringArrayList< String>>();

      ArrayList< String> errorList= new ArrayList< String>();// error messages

      ArrayList< String> result=new ArrayList< String>(); // search results

      String startUrl;// starting point of the search

      // URLs still waiting to be crawled.
      LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();

      boolean caseSensitive=false;// whether matching is case sensitive

      boolean limitHost=false;// whether to restrict the search to the starting host

      private static String outdir;

  

      private static String seroutdir;

      private static String seroutdirimg;

      // Loop flag read by TimeWriteFile; set in run() and cleared at the end of crawl().
      // NOTE(review): read and written from different threads without volatile.
      private boolean blnFlag=false;

  

      private static  PoCalSearch ps=null;

      private static PoDownload pd=null;

  

      // image-analysis threads (the thread count in this comment was lost)

      private static ExecutorService   execImg;

      // NOTE(review): the permit count argument was lost in extraction.
      final Semaphore sempImg = new Semaphore();

  

      // page-analysis threads (the thread count in this comment was lost)

      private static ExecutorService   execPage;

      // NOTE(review): the permit count argument was lost in extraction.
      final Semaphore sempPage = new Semaphore();

  

      private ArrayList<ParsePage> arrPar=new ArrayList<ParsePage>();

  

      // writer that records the crawl results

      private static BufferedWriter bw = null;

  

    public SearchCrawler(String startUrl)

    {

     thisstartUrl=startUrl;

  

    }

     public ArrayList< String> getResult(){

         return result;

     }

    public void run(){//啟動搜索線程

         new Thread(new TimeWriteFile())start();

         blnFlag=true;

         crawl(startUrllimitHostcaseSensitive);

  

    }

  

      //檢測URL格式

    private URL verifyUrl(String url) {

      // 只處理HTTP URLs

      if (!urltoLowerCase()startsWith(//))

        return null;

      URL verifiedUrl = null;

      try {

        verifiedUrl = new URL(url);

      } catch (Exception e) {

        return null;

      }

      return verifiedUrl;

    }

    // 檢測robot是否允許訪問給出的URL

   private boolean isRobotAllowed(URL urlToCheck) {

      String host = urlToCheckgetHost()toLowerCase();//獲取給出RUL的主機

      //Systemoutprintln(主機=+host);

      // 獲取主機不允許搜索的URL緩存

      ArrayList< String> disallowList =disallowListCacheget(host);

      // 如果還沒有緩存下載並緩存

      if (disallowList == null) {

        disallowList = new ArrayList< String>();

        try {

          URL robotsFileUrl =new URL(// + host + /robotstxt);

          BufferedReader reader =new BufferedReader(new InputStreamReader(robotsFileUrlopenStream()));

          // 讀robot文件創建不允許訪問的路徑列表

          String line;

          while ((line = readerreadLine()) != null) {

            if (lineindexOf(Disallow:) == ) {//是否包含Disallow:

              String disallowPath =linesubstring(Disallow:length());//獲取不允許訪問路徑

              // 檢查是否有注釋

              int commentIndex = disallowPathindexOf(#);

              if (commentIndex != ) {

                disallowPath =disallowPathsubstring( commentIndex);//去掉注釋

              }

  

              disallowPath = disallowPathtrim();

              disallowListadd(disallowPath);

             }

           }

          // 緩存此主機不允許訪問的路徑

          disallowListCacheput(host disallowList);

        } catch (Exception e) {

                return true; //web站點根目錄下沒有robotstxt文件返回真

        }

      }

  

      String file = urlToCheckgetFile();

      //Systemoutprintln(文件getFile()=+file);

      for (int i = ; i < disallowListsize(); i++) {

        String disallow = disallowListget(i);

        if (filestartsWith(disallow)) {

          return false;

        }

      }

      return true;

    }

  

  

    private String downloadPage(URL pageUrl) {

       try {

  

  

  

           // Open connection to URL for reading

           BufferedReader reader =

            new BufferedReader(new InputStreamReader(pageUrlopenStream()));

  

  

          // Read page into buffer

          String line;

          StringBuffer pageBuffer = new StringBuffer();

          while ((line = readerreadLine()) != null) {

            pageBufferappend(line);

          }

  

          return pageBuffertoString();

       } catch (Exception e) {

           eprintStackTrace();

       }

       return null;

    }

    // 從URL中去掉www

    private String removeWwwFromUrl(String url) {

      int index = urlindexOf(://www);

      if (index != ) {

        return urlsubstring( index + ) +

          urlsubstring(index + );

      }

      return (url);

    }

    // 解析頁面並找出鏈接

    private ArrayList< String> retrieveLinks(URL pageUrl String pageContents

      boolean limitHost)

    {

      // 用正則表達式編譯鏈接的匹配模式

      Pattern p =pile(<a\\s+href\\s*=\\s*\?(*?)[\|>]PatternCASE_INSENSITIVE);

      Matcher m = pmatcher(pageContents);

  

      ArrayList< String> linkList = new ArrayList< String>();

      while (mfind()) {

        String link = mgroup()trim();

  

        if (linklength() < ) {

          continue;

        }

        // 跳過鏈到本頁面內鏈接

        if (linkcharAt() == #) {

          continue;

        }

  

        if (linkindexOf(mailto:) != ) {

          continue;

        }

  

        if (linktoLowerCase()indexOf(javascript) != ) {

          continue;

        }

        if (linkindexOf(://) == ){

          if (linkcharAt() == /) {//處理絕對地

            link = // + pageUrlgetHost()+:+pageUrlgetPort()+ link;

          } else {

            String file = pageUrlgetFile();

            if (fileindexOf(/) == ) {//處理相對地址

              link = // + pageUrlgetHost()+:+pageUrlgetPort() + / + link;

            } else {

              String path =filesubstring( filelastIndexOf(/) + );

              link = // + pageUrlgetHost() +:+pageUrlgetPort()+ path + link;

            }

          }

        }

        int index = linkindexOf(#);

        if (index != ) {

          link = linksubstring( index);

        }

        link = removeWwwFromUrl(link);

        URL verifiedLink = verifyUrl(link);

        if (verifiedLink == null) {

          continue;

        }

        /* 如果限定主機排除那些不合條件的URL*/

        if (limitHost &&

            !pageUrlgetHost()toLowerCase()equals(

              verifiedLinkgetHost()toLowerCase()))

        {

          continue;

        }

        // 跳過那些已經處理的鏈接

        if(ntains(link))

        {

            logEvent(匹配了:+link);

            continue;

        }

        else

        {

            filterUrladd(link);

        }

  

         linkListadd(link);

      }

     return (linkList);

    }

  

  

  

  

    // 解析頁面並找出鏈接

    private ArrayList< String> retrieveImgLinks(URL pageUrl String pageContents

      boolean limitHost)

    {

      // 用正則表達式編譯鏈接的匹配模式

      Pattern p =pile(<img\\s+src\\s*=\\s*\?(*?)[\|>]PatternCASE_INSENSITIVE);

      Matcher m = pmatcher(pageContents);

  

      ArrayList< String> linkList = new ArrayList< String>();

      while (mfind()) {

        String link = mgroup()trim();

  

        if (linklength() < ) {

          continue;

        }

        // 跳過鏈到本頁面內鏈接

        if (linkcharAt() == #) {

          continue;

        }

  

        if (linkindexOf(mailto:) != ) {

          continue;

        }

  

        if (linktoLowerCase()indexOf(javascript) != ) {

          continue;

        }

        if (linktoLowerCase()endsWith(gif)) {

            continue;

          }

        if (linkindexOf(://) == )

        {

          if (linkcharAt() == /)

          {//處理絕對地

            link = // + pageUrlgetHost()+:+pageUrlgetPort()+ link;

          }

          else

          {

            String file = pageUrlgetFile();

            if (fileindexOf(/) == ) {//處理相對地址

              link = // + pageUrlgetHost()+:+pageUrlgetPort() + / + link;

            } else {

              String path =filesubstring( filelastIndexOf(/) + );

              link = // + pageUrlgetHost() +:+pageUrlgetPort()+ path + link;

            }

          }

        }

        int index = linkindexOf(#);

        if (index != ) {

          link = linksubstring( index);

        }

        link = removeWwwFromUrl(link);

        URL verifiedLink = verifyUrl(link);

        if (verifiedLink == null) {

          continue;

        }

        /* 如果限定主機排除那些不合條件的URL*/

        if (limitHost &&

            !pageUrlgetHost()toLowerCase()equals(

              verifiedLinkgetHost()toLowerCase()))

        {

          continue;

        }

        // 跳過那些已經處理的鏈接

  //      if (ntains(link)) {

  //        continue;

  //      }

        if(ntains(link))

        {

            logEvent(圖片匹配了:+link);

            continue;

        }

        else

        {

            filterImgadd(link);

        }

  

  

        if(linklastIndexOf(gif)==)

        {

            linkListadd(link);

        }

  

  

  

      }

     return (linkList);

    }

  

      //執行實際的搜索操作

      /**
       * Drives the crawl: puts the start URL into {@code toCrawlList}, then repeatedly
       * takes a pending URL, removes it from the set and submits a {@code ParsePage}
       * task for it to {@code execPage}. When the loop ends, {@code blnFlag} is cleared.
       *
       * NOTE(review): this listing lost '.', ',' and every numeric literal during
       * extraction (loop threshold, counters' initial values, retry limit, sleep
       * durations). The code is reproduced as found and must be restored before use.
       *
       * NOTE(review): the loop condition only looks at the size of toCrawlList; it
       * presumably can end while ParsePage tasks are still adding links - confirm.
       *
       * @param startUrl      page to start from; passed through removeWwwFromUrl first
       * @param limithost     not referenced in the body
       * @param caseSensitive not referenced in the body
       * @return the {@code result} field
       */
      public ArrayList< String> crawl(String startUrlboolean limithostboolean caseSensitive )

      {

  

          // Strip "www" from the start URL

          startUrl = removeWwwFromUrl(startUrl);

  

          toCrawlListadd(startUrl);

  

          int idxPageParse=;

          while (toCrawlListsize()>)

          {

              try

              {

                  idxPageParse++;

                  // Get URL at bottom of the list

                  // NOTE(review): garbled; presumably toCrawlList.iterator().next() - confirm

                  String url =  erator()next();

                  pssetIntUrl(psgetIntUrl()+);

                  // Remove URL from the to crawl list

                  toCrawlListremove(url);

  

                  // Wait until sempPage reports an available permit; after a (lost)
                  // number of retries, release one permit and stop waiting.

                  int intRetryPage=;

                  while (sempPageavailablePermits()<=)

                  {

                      Systemoutprintln(暫時沒有空閒的網頁分析線程等待秒再執行);

                      try {

                          intRetryPage++;

                          if(intRetryPage==)

                          {

                              logEvent(分析網頁+url+超時);

                              sempPagerelease();

                              break;

                          }

                          Threadsleep();

                      } catch (InterruptedException e) {

                          eprintStackTrace();

                      }

                  }

  

  

                  ParsePage tempPageThread=new ParsePage(url);

                  execPagesubmit(tempPageThread);

                  logEvent(開啟網頁分析線程+idxPageParse);

                  // Pause once, when the page counter equals a (lost) value.

                  if(idxPageParse==)

                  {

                      ThreadcurrentThread()sleep();

                  }

  

              }catch(Exception e)

              {

                  eprintStackTrace();

              }

  

          }

          // Tells the TimeWriteFile loop to stop.

          blnFlag=false;

  

          logEvent(抓圖完成);

  

          return result;

      }

    public static  void logEvent(String strLog)

    {

        Systemoutprintln( new SimpleDateFormat(yyyy年MM月dd日HH時mm分ss秒)format(new Date(CalendargetInstance()getTimeInMillis()))+=====>+strLog);

  

    }

  

    // 主函數

    /**
     * Program entry point: reads the command-line arguments, prepares the output
     * directory, thread pools, Bloom filters and result log, then runs one
     * SearchCrawler on its own thread and waits for it to finish.
     *
     * NOTE(review): every args[] index, the expected argument count and both pool
     * sizes were lost when this listing was extracted, together with '.', ',' and
     * string quotes. The code is reproduced as found and must be restored before use.
     *
     * @param args command-line arguments (meaning of each position is not recoverable here)
     */
    public static void main(String[] args) {

       if(argslength!=)

       {

          Systemoutprintln(Usage:java SearchCrawler startUrl maxUrl searchString);

          return;

       }

      @SuppressWarnings(unused)

      String strLogPath=args[];

      SearchCrawler crawler = new SearchCrawler(args[]);

  

  

      // Dated output directory for downloaded pictures.

      outdir=args[]+/pic+new SimpleDateFormat(yyyyMMdd)format(new Date(CalendargetInstance()getTimeInMillis()))+/;

      File f=new File(outdir);

      if(!fexists())

      {

          fmkdir();

      }

  

      execPage   =   ExecutorsnewFixedThreadPool();

      execImg   =   ExecutorsnewFixedThreadPool();

  

      // Paths of the serialized URL filter and image filter.

      seroutdir=args[];

      seroutdirimg=args[];

  

      ps=new PoCalSearch();

      pd=new PoDownload();

      // Reload previously saved filters if present, otherwise start with empty ones.
      // NOTE(review): each file is deserialized twice (once for the null check).

      try {

          if(UtilSerizreadObject(seroutdir)!=null)

          {

              Systemoutprintln(new SimpleDateFormat(yyyy年MM月dd日HH時mm分ss秒)format(new Date(CalendargetInstance()getTimeInMillis()))+=====>+反序列化URL);

              filterUrl=(SimpleBloomFilter)UtilSerizreadObject(seroutdir);

          }

          else

          {

              filterUrl=new SimpleBloomFilter();

          }

          // NOTE(review): this tests seroutdir but then loads seroutdirimg; it looks
          // like a copy-paste slip and leaves filterImg null when only the URL file
          // exists - confirm and test seroutdirimg instead.

          if(UtilSerizreadObject(seroutdir)!=null)

          {

              Systemoutprintln(new SimpleDateFormat(yyyy年MM月dd日HH時mm分ss秒)format(new Date(CalendargetInstance()getTimeInMillis()))+=====>+反序列化圖片);

  

              filterImg=(SimpleBloomFilter)UtilSerizreadObject(seroutdirimg);

          }

          else

          {

              filterImg=new SimpleBloomFilter();

          }

      } catch (Exception e) {

          eprintStackTrace();

      }

  

      // Open the result log (overwrite mode).

      String strPic=args[]+/pic+new SimpleDateFormat(yyyyMMdd)format(new Date(CalendargetInstance()getTimeInMillis()))+log;

      try {

          bw=new BufferedWriter(new FileWriter(strPicfalse));

      } catch (IOException e) {

          // TODO Autogenerated catch block

          eprintStackTrace();

      }

  

      Thread  search=new Thread(crawler);

      Systemoutprintln( new SimpleDateFormat(yyyy年MM月dd日HH時mm分ss秒)format(new Date(CalendargetInstance()getTimeInMillis()))+=====>+開始爬圖);

      Systemoutprintln(下載了圖:);

      searchstart();

      // Wait for the crawl thread, then close the result log.
      // NOTE(review): execPage and execImg are never shut down in the visible code.

      try {

          searchjoin();

          logEvent(主函數結束);

          bwclose();

      } catch (Exception e) {

          // TODO Autogenerated catch block

          eprintStackTrace();

      }

  

  

  

  

    }

  

    /**
     * Task that downloads one image: acquires a {@code sempImg} permit, reads the image
     * to check its dimensions, copies the stream to a file under {@code outdir},
     * deletes the file again if it is too small, and finally releases the permit.
     * {@link #run()} additionally appends a line to the shared result writer.
     *
     * NOTE(review): '.', ',', quotes and all numeric literals (minimum width/height,
     * random multiplier, minimum file size, end-of-stream marker, return values) were
     * lost when this listing was extracted; the code is reproduced as found.
     *
     * @author binbin
     *
     */

    public class ImgDownThread implements RunnableCallable<Long>{

        // URL to download

        private String stru;

  

          private boolean isStart=true;

  

          public ImgDownThread(String strurl) {

              super();

              thisstru = strurl;

          }

  

          /**
           * Downloads the image and logs it to the result writer.
           * NOTE(review): openStream() is called twice, so the image is presumably
           * fetched twice (once for ImageIO, once for the byte copy) - confirm.
           */

          @Override

          public void run()

          {

  

  

              try

              {

                      sempImgacquire();

                      try{

  

  

                      URL url=new URL(stru);

                      BufferedInputStream  in = new BufferedInputStream(urlopenStream());

  

                      BufferedImage bi=ImageIOread(urlopenStream());

  

                      // size requirement (thresholds lost)

                      if (bi==null|| bigetWidth()< || bigetHeight()<  )

                      {

  

                          inclose();

                          return;

                      }

                      // File name: timestamp, random number, then the tail of the URL.

                      String ss=new SimpleDateFormat(yyyyMMddHHmmss)format(new Date(CalendargetInstance()getTimeInMillis()))+_+Mathround(Mathrandom()*L+)+strusubstring(strulastIndexOf());

                      String s=outdir+ss;

                      FileOutputStream   file = new FileOutputStream(new File(s));

                      // Byte-by-byte copy.
                      // NOTE(review): 'file' and 'in' are not closed if an exception occurs.

                      int t;

                      while ((t = inread()) != )

                      {

                          filewrite(t);

                      }

                      fileclose();

                      // Discard files at or below a (lost) size limit.

                      if(new File(s)length()<=*)

                      {

  

                          inclose();

                          new File(s)delete();

                          return;

                      }

  

                      // The result writer is shared between download tasks.

                      synchronized(bw)

                      {

                          String str=ss+:+stru;

                          bwwrite(str);

                          bwnewLine();

                          bwflush();

                      }

                      logEvent(下載了:+stru);

                      pssetIntImg(psgetIntImg()+);

                      inclose();

  

                      }catch(Exception e){

                           logEvent(**********************下載圖片:+stru+超時);

                      }

              }

  

  

              catch (Exception e)

              {

  

                  eprintStackTrace();

              }

              finally{

                  sempImgrelease();

              }

  

          }

  

  

          public boolean isStart() {

              return isStart;

          }

  

          public void setStart(boolean isStart) {

              thisisStart = isStart;

          }

  

          /**
           * Callable variant of {@link #run()}; the body is the same except that it
           * does not write to the result writer and returns a (lost) long value.
           * NOTE(review): the return inside finally discards any pending exception.
           */

          @Override

          public Long call() throws Exception {

              try

              {

                      sempImgacquire();

                      try{

  

  

                      URL url=new URL(stru);

                      BufferedInputStream  in = new BufferedInputStream(urlopenStream());

  

                      BufferedImage bi=ImageIOread(urlopenStream());

  

                      // size requirement (thresholds lost)

                      if (bi==null|| bigetWidth()< || bigetHeight()<  )

                      {

  

                          inclose();

                          return l;

                      }

                      String ss=new SimpleDateFormat(yyyyMMddHHmmss)format(new Date(CalendargetInstance()getTimeInMillis()))+_+Mathround(Mathrandom()*L+)+strusubstring(strulastIndexOf());

                      String s=outdir+ss;

                      FileOutputStream   file = new FileOutputStream(new File(s));

                      int t;

                      while ((t = inread()) != )

                      {

                          filewrite(t);

                      }

                      fileclose();

                      if(new File(s)length()<=*)

                      {

  

                          inclose();

                          new File(s)delete();

                          return l;

                      }

  

                      logEvent(下載了:+stru);

                      pssetIntImg(psgetIntImg()+);

                      inclose();

  

                      }catch(Exception e){

                           logEvent(**********************下載圖片:+stru+超時);

                      }

              }

  

  

              catch (Exception e)

              {

  

                  eprintStackTrace();

              }

              finally{

                  sempImgrelease();

                  return l;

              }

  

          }

  

    }

  

    /***
     * Periodically serializes the filters of already-visited page URLs and image URLs
     * to {@code seroutdir} / {@code seroutdirimg} and logs the progress counters, for
     * as long as {@code blnFlag} is true.
     *
     * NOTE(review): '.', ',' and the sleep duration were lost when this listing was
     * extracted; the code is reproduced as found.
     *
     * @author binbin
     *
     */

    public class TimeWriteFile implements Runnable

    {

        @Override

        public void run()

        {

            while(blnFlag)

            {

                try

                {

  

                    // NOTE(review): this locks ps, but the visible code that updates
                    // the filters does not lock ps, so the snapshot is presumably not
                    // isolated from concurrent add() calls - confirm.

                    synchronized(ps)

                    {

                        logEvent(開始序列化URL);

                        UtilSerizwriteObject(filterUrlseroutdir);

                        logEvent(結束序列化URL);

                        logEvent(開始序列化圖片);

                        UtilSerizwriteObject(filterImgseroutdirimg);

                        logEvent(結束序列化圖片);

                        logEvent(分析了+psgetIntUrl()+個鏈接);

                        logEvent(下載了+psgetIntImg()+張圖片);

                    }

                        // Pause between snapshots (duration lost).

                        Threadsleep();

  

                }

                catch (Exception e)

                {

                    eprintStackTrace();

                }

  

            }

  

        }

  

      }

  

  

      /***
       * Task that analyses the page behind one URL: acquires a {@code sempPage} permit,
       * checks robots permission, downloads the page, adds the links found to
       * {@code toCrawlList} and submits one {@code ImgDownThread} per image link to
       * {@code execImg}; the permit is released in {@code finally}.
       *
       * NOTE(review): '.', ',', quotes and all numeric literals (queue limit, retry
       * limit, sleep durations, batch size) were lost when this listing was extracted;
       * the code is reproduced as found.
       *
       * @author Administrator
       *
       */

      class ParsePage extends Thread

      {

  

          String url;

          int iCount=;

  

          public int getiCount() {

              return iCount;

          }

          public void setiCount(int iCount) {

              thisiCount = iCount;

          }

          public String getUrl()

          {

              return url;

          }

          public void setUrl(String url)

          {

              thisurl = url;

          }

          public ParsePage(String url) {

              thisurl=url;

          }

          @Override

          public void run()

          {

              try

              {

                  sempPageacquire();

                  // Convert string url to URL object

                  // NOTE(review): verifyUrl can return null; the next call would then
                  // throw a NullPointerException, caught by the catch block below.

                  URL verifiedUrl = verifyUrl(url);

  

                  // Skip URL if robots are not allowed to access it

                  // NOTE(review): Thread.stop() is deprecated and unsafe; a plain
                  // return would reach the finally block just the same - confirm.

                  if (!isRobotAllowed(verifiedUrl))

                  {

                      ThreadcurrentThread()stop();

                  }

  

  

                  // (original comment: add the processed URL to crawledList - no such
                  // list exists in the visible code)

                  String pageContents=;

  

                  pageContents = downloadPage(verifiedUrl);

  

                  logEvent(分析了:+verifiedUrl);

                  logEvent(待分析URL數:+toCrawlListsize()+);

  

  

                  if (pageContents != null && pageContentslength() > )

                  {

                      // Extract the valid page links

                      ArrayList< String> links =retrieveLinks(verifiedUrl pageContentslimitHost);

  

                      // Extract the valid image links

                      ArrayList< String> imglinks =retrieveImgLinks(verifiedUrl pageContentslimitHost);

  

                      // Queue the page links unless the pending set is already over a
                      // (lost) limit.
                      // NOTE(review): toCrawlList is a LinkedHashSet shared with the
                      // crawl() thread and modified here without synchronization.

                      if(toCrawlListsize()<)

                      {

                          toCrawlListaddAll(links);

                      }

                      else

                      {

                          logEvent(待分析的網頁URL超過!!!!跳過);

                      }

  

  

  

                      for(int i=;i<imglinkssize();i++)

                      {

                          if(imglinksget(i)indexOf()!=)

                          {

  

                              iCount++;

                              filterImgadd(imglinksget(i));

                              pssetIntImg(psgetIntImg()+);

  

  

                              // Wait until sempImg reports an available permit.
                              // NOTE(review): unlike the matching loop in crawl(),
                              // there is no break after the retry limit is hit.

                              int intRetryImg=;

                              while (sempImgavailablePermits() <= )

                              {

                                  Systemoutprintln(暫時沒有空閒的抓圖線程等待秒再執行);

                                  try {

                                      intRetryImg++;

                                      if(intRetryImg==)

                                      {

                                          logEvent(抓圖+imglinksget(i)+超時);

                                          sempImgrelease();

                                      }

                                      Threadsleep();

                                  } catch (InterruptedException e) {

                                      eprintStackTrace();

                                  }

                              }

                              // The download task is wrapped in a Thread object but is
                              // executed by the execImg pool, not started directly.

                              Thread tempImgThread=new Thread(new ImgDownThread(imglinksget(i)));

                              execImgsubmit(tempImgThread);

  

                              // Pause after every (lost) number of images.

                              if((iCount!=) && (iCount%==) )

                              {

                                  try

                                  {

                                      logEvent(圖多休息);

                                      ThreadcurrentThread()sleep();

                                  }

                                  catch (InterruptedException e)

                                  {

                                      eprintStackTrace();

                                  }

                              }

  

  

                          }

  

                      }

                  }

                  // NOTE(review): nothing in the visible code ever adds to arrPar.

                  synchronized(arrPar)

                  {

                      arrParremove(this);

                  }

              }

              catch(Exception e)

              {

                  eprintStackTrace();

  

              }

              finally

              {

                  sempPagerelease();

              }

          }

  

      }

  }


From:http://tw.wingwit.com/Article/program/Java/hx/201311/25795.html
    推薦文章
    Copyright © 2005-2013 電腦知識網 Computer Knowledge   All rights reserved.