主要難點:
並發線程的控制:採用了 JDK 的 java.util.concurrent 包
去重
序列化
運行方法:java -Xms<N>M -Xmx<N>M -jar JavaCrawler.jar C:/a.log D:/pic D:/url.tmp D:/img.tmp(堆大小 <N> 的原始數值在轉載時丟失,請按需要填寫)
SimpleBloomFilter.java
view plaincopy to clipboardprint?
····················································································································package comhengkingcrawl;
import javaioSerializable;
import javautilBitSet;
/**
 * A small Bloom filter over strings.
 *
 * <p>Each value is hashed with several seeded polynomial hashes and the
 * corresponding bits are set in a {@link BitSet}. {@link #contains(String)}
 * may report false positives but never false negatives for values that were
 * added to this instance.
 *
 * <p>{@code add} and {@code contains} are synchronized, and serialization takes
 * the same lock, because {@link BitSet} itself is not safe for concurrent use.
 *
 * <p>NOTE(review): the numeric literals of this class (serialVersionUID, bit
 * count, seeds, the mask offset in {@code hash}) were missing from the text this
 * was restored from. The values below are the ones commonly published with this
 * implementation; confirm them before reading filters written by another build.
 */
public class SimpleBloomFilter implements Serializable {
    // assumed value, see class note
    private static final long serialVersionUID = 1L;

    // Number of bits in the filter. Must stay a power of two: hash() masks with (cap - 1).
    private final int DEFAULT_SIZE = 2 << 24; // assumed value, see class note

    // One hash function is created per seed.
    private final int[] seeds = new int[] { 5, 7, 11, 13, 31, 37, 61 }; // assumed values

    private BitSet bits = new BitSet(DEFAULT_SIZE);
    private SimpleHash[] func = new SimpleHash[seeds.length];

    /** Creates an empty filter with one hash function per seed. */
    public SimpleBloomFilter() {
        for (int i = 0; i < seeds.length; i++) {
            func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
        }
    }

    /**
     * Records a value in the filter.
     *
     * @param value the value to remember; {@code null} is ignored, matching
     *              {@link #contains(String)} which never reports {@code null}
     */
    public synchronized void add(String value) {
        if (value == null) {
            return;
        }
        for (SimpleHash f : func) {
            bits.set(f.hash(value), true);
        }
    }

    /**
     * Tests whether a value may have been added.
     *
     * @param value the value to look up
     * @return {@code false} if the value is {@code null} or was definitely never
     *         added; {@code true} if every hash position is set
     */
    public synchronized boolean contains(String value) {
        if (value == null) {
            return false;
        }
        for (SimpleHash f : func) {
            if (!bits.get(f.hash(value))) {
                // one clear bit is enough to prove absence
                return false;
            }
        }
        return true;
    }

    /**
     * Serialization hook: writes the default field layout while holding the
     * same lock as add/contains, so a snapshot is never taken mid-update.
     */
    private synchronized void writeObject(java.io.ObjectOutputStream out) throws java.io.IOException {
        out.defaultWriteObject();
    }

    /**
     * Seeded polynomial string hash that maps a value to a bit index.
     * Kept as an inner (non-static) class, as in the original, so the
     * serialized form of existing filters does not change shape.
     */
    public class SimpleHash implements Serializable {
        private int cap;
        private int seed;

        /**
         * @param cap  number of bits addressed; expected to be a power of two
         * @param seed multiplier used by the polynomial hash
         */
        public SimpleHash(int cap, int seed) {
            this.cap = cap;
            this.seed = seed;
        }

        /**
         * @param value string to hash; must not be {@code null}
         * @return an index in {@code [0, cap)} when cap is a power of two
         */
        public int hash(String value) {
            int result = 0;
            int len = value.length();
            for (int i = 0; i < len; i++) {
                result = seed * result + value.charAt(i);
            }
            // masking keeps the index non-negative even after int overflow
            return (cap - 1) & result;
        }
    }

    @Override
    public String toString() {
        return super.toString();
    }
}
package comhengkingcrawl;
import javaioSerializable;
import javautilBitSet;
/**
 * A small Bloom filter over strings.
 *
 * <p>Each value is hashed with several seeded polynomial hashes and the
 * corresponding bits are set in a {@link BitSet}. {@link #contains(String)}
 * may report false positives but never false negatives for values that were
 * added to this instance.
 *
 * <p>{@code add} and {@code contains} are synchronized, and serialization takes
 * the same lock, because {@link BitSet} itself is not safe for concurrent use.
 *
 * <p>NOTE(review): the numeric literals of this class (serialVersionUID, bit
 * count, seeds, the mask offset in {@code hash}) were missing from the text this
 * was restored from. The values below are the ones commonly published with this
 * implementation; confirm them before reading filters written by another build.
 */
public class SimpleBloomFilter implements Serializable {
    // assumed value, see class note
    private static final long serialVersionUID = 1L;

    // Number of bits in the filter. Must stay a power of two: hash() masks with (cap - 1).
    private final int DEFAULT_SIZE = 2 << 24; // assumed value, see class note

    // One hash function is created per seed.
    private final int[] seeds = new int[] { 5, 7, 11, 13, 31, 37, 61 }; // assumed values

    private BitSet bits = new BitSet(DEFAULT_SIZE);
    private SimpleHash[] func = new SimpleHash[seeds.length];

    /** Creates an empty filter with one hash function per seed. */
    public SimpleBloomFilter() {
        for (int i = 0; i < seeds.length; i++) {
            func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
        }
    }

    /**
     * Records a value in the filter.
     *
     * @param value the value to remember; {@code null} is ignored, matching
     *              {@link #contains(String)} which never reports {@code null}
     */
    public synchronized void add(String value) {
        if (value == null) {
            return;
        }
        for (SimpleHash f : func) {
            bits.set(f.hash(value), true);
        }
    }

    /**
     * Tests whether a value may have been added.
     *
     * @param value the value to look up
     * @return {@code false} if the value is {@code null} or was definitely never
     *         added; {@code true} if every hash position is set
     */
    public synchronized boolean contains(String value) {
        if (value == null) {
            return false;
        }
        for (SimpleHash f : func) {
            if (!bits.get(f.hash(value))) {
                // one clear bit is enough to prove absence
                return false;
            }
        }
        return true;
    }

    /**
     * Serialization hook: writes the default field layout while holding the
     * same lock as add/contains, so a snapshot is never taken mid-update.
     */
    private synchronized void writeObject(java.io.ObjectOutputStream out) throws java.io.IOException {
        out.defaultWriteObject();
    }

    /**
     * Seeded polynomial string hash that maps a value to a bit index.
     * Kept as an inner (non-static) class, as in the original, so the
     * serialized form of existing filters does not change shape.
     */
    public class SimpleHash implements Serializable {
        private int cap;
        private int seed;

        /**
         * @param cap  number of bits addressed; expected to be a power of two
         * @param seed multiplier used by the polynomial hash
         */
        public SimpleHash(int cap, int seed) {
            this.cap = cap;
            this.seed = seed;
        }

        /**
         * @param value string to hash; must not be {@code null}
         * @return an index in {@code [0, cap)} when cap is a power of two
         */
        public int hash(String value) {
            int result = 0;
            int len = value.length();
            for (int i = 0; i < len; i++) {
                result = seed * result + value.charAt(i);
            }
            // masking keeps the index non-negative even after int overflow
            return (cap - 1) & result;
        }
    }

    @Override
    public String toString() {
        return super.toString();
    }
}
UtilSeriz.java
view plaincopy to clipboardprint?
····················································································································package comhengkingcrawl;
import javaio*;
/**
 * Static helpers that save an object to a file and load it back using
 * Java's built-in serialization.
 */
public class UtilSeriz
{
    /**
     * Serializes an object to a file on disk, replacing any existing file.
     *
     * @param o       the object to write; must be serializable
     * @param strPath path of the file to create
     * @throws Exception if the file cannot be created or the object cannot be written
     */
    public static void writeObject(Object o, String strPath) throws Exception {
        File f = new File(strPath);
        if (f.exists()) {
            f.delete();
        }
        FileOutputStream os = new FileOutputStream(f);
        try {
            ObjectOutputStream oos = new ObjectOutputStream(os);
            oos.writeObject(o);
            // push buffered object data to the file before the stream is closed
            oos.flush();
        } finally {
            // always release the file handle, even when writing fails
            os.close();
        }
    }

    /**
     * Deserializes the object stored in a file.
     *
     * <p>NOTE(review): native Java deserialization must only be used on files
     * this program wrote itself; it is unsafe on untrusted input.
     *
     * @param strPath path of the file to read
     * @return the deserialized object, or {@code null} if the file does not exist
     * @throws Exception if the file cannot be read or does not hold a valid object
     */
    public static Object readObject(String strPath) throws Exception {
        File f = new File(strPath);
        if (!f.exists()) {
            return null;
        }
        InputStream is = new FileInputStream(f);
        try {
            ObjectInputStream ois = new ObjectInputStream(is);
            return ois.readObject();
        } finally {
            // the original never closed this stream, leaking a file handle per call
            is.close();
        }
    }
}
package comhengkingcrawl;
import javaio*;
/**
 * Static helpers that save an object to a file and load it back using
 * Java's built-in serialization.
 */
public class UtilSeriz
{
    /**
     * Serializes an object to a file on disk, replacing any existing file.
     *
     * @param o       the object to write; must be serializable
     * @param strPath path of the file to create
     * @throws Exception if the file cannot be created or the object cannot be written
     */
    public static void writeObject(Object o, String strPath) throws Exception {
        File f = new File(strPath);
        if (f.exists()) {
            f.delete();
        }
        FileOutputStream os = new FileOutputStream(f);
        try {
            ObjectOutputStream oos = new ObjectOutputStream(os);
            oos.writeObject(o);
            // push buffered object data to the file before the stream is closed
            oos.flush();
        } finally {
            // always release the file handle, even when writing fails
            os.close();
        }
    }

    /**
     * Deserializes the object stored in a file.
     *
     * <p>NOTE(review): native Java deserialization must only be used on files
     * this program wrote itself; it is unsafe on untrusted input.
     *
     * @param strPath path of the file to read
     * @return the deserialized object, or {@code null} if the file does not exist
     * @throws Exception if the file cannot be read or does not hold a valid object
     */
    public static Object readObject(String strPath) throws Exception {
        File f = new File(strPath);
        if (!f.exists()) {
            return null;
        }
        InputStream is = new FileInputStream(f);
        try {
            ObjectInputStream ois = new ObjectInputStream(is);
            return ois.readObject();
        } finally {
            // the original never closed this stream, leaking a file handle per call
            is.close();
        }
    }
}
SearchCrawler.java
view plaincopy to clipboardprint?
····················································································································package comhengkingcrawl;
import javaawtimageBufferedImage;
import javaioBufferedInputStream;
import javaioBufferedReader;
import javaioBufferedWriter;
import javaioFile;
import javaioFileOutputStream;
import javaioFileWriter;
import javaioIOException;
import javaioInputStreamReader;
import URL;
import javatextSimpleDateFormat;
import javautilArrayList;
import javautilCalendar;
import javautilDate;
import javautilHashMap;
import javautilLinkedHashSet;
import ncurrentCallable;
import ncurrentExecutorService;
import ncurrentExecutors;
import ncurrentSemaphore;
import javautilregexMatcher;
import javautilregexPattern;
import javaximageioImageIO;
import comhengkingcrawlpoPoCalSearch;
import comhengkingcrawlpoPoDownload;
/***
 * Description: image-grabbing crawler tool.
 * Starting from one URL it hands pages to a page-analysis executor, extracts
 * anchor and image links, and submits image links to an image-download executor.
 * Two SimpleBloomFilter instances (URLs and images) are used to skip links
 * that were already seen.
 * NOTE(review): in this copy of the source the numeric and string literals
 * (semaphore permits, pool sizes, sleep times, thresholds, args[] indexes,
 * URL prefixes, regex quotes) are missing, so the code below does not compile
 * as shown; the comments describe only what the remaining text establishes.
 * @author 君望永遠
 *
 */
public class SearchCrawler implements Runnable{
/* disallowListCache caches, per host, the paths that robots are not allowed to crawl.
* The robots protocol places a robots.txt file in the root directory of a web site
* which states which pages of the site must not be crawled; a crawler should skip
* those areas while crawling. An example robots.txt:
# robotstxt for
Useragent: *
Disallow: /cgibin/
Disallow: /registration # /Disallow robots on registration page
Disallow: /login
*/
// Bloom filters of already-seen page URLs and image URLs; static, so shared by all threads.
public static SimpleBloomFilter filterUrl;
public static SimpleBloomFilter filterImg;
private HashMap< StringArrayList< String>> disallowListCache = new HashMap< StringArrayList< String>>();
ArrayList< String> errorList= new ArrayList< String>();//error messages
ArrayList< String> result=new ArrayList< String>(); //results found by the crawl
String startUrl;//starting point of the crawl
// NOTE(review): read and modified by crawl() and by ParsePage.run() without synchronization.
LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();
boolean caseSensitive=false;//whether matching is case-sensitive
boolean limitHost=false;//whether to stay within the starting host
private static String outdir;
private static String seroutdir;
private static String seroutdirimg;
// Loop condition of TimeWriteFile; set true in run() and false at the end of crawl().
private boolean blnFlag=false;
private static PoCalSearch ps=null;
private static PoDownload pd=null;
//image-download threads (the count is not shown in this copy)
private static ExecutorService execImg;
final Semaphore sempImg = new Semaphore();
//page-analysis threads (the count is not shown in this copy)
private static ExecutorService execPage;
final Semaphore sempPage = new Semaphore();
// NOTE(review): nothing in this class adds to arrPar; ParsePage.run() only removes from it.
private ArrayList<ParsePage> arrPar=new ArrayList<ParsePage>();
//writer that records the download results
private static BufferedWriter bw = null;
/** Creates a crawler that will start from the given URL. */
public SearchCrawler(String startUrl)
{
thisstartUrl=startUrl;
}
/** Returns the result list; nothing in this class adds entries to it. */
public ArrayList< String> getResult(){
return result;
}
// NOTE(review): the TimeWriteFile thread is started BEFORE blnFlag is set to true, so its
// while(blnFlag) loop may observe false and exit immediately; blnFlag is also not volatile.
public void run(){//starts the crawl thread
new Thread(new TimeWriteFile())start();
blnFlag=true;
crawl(startUrllimitHostcaseSensitive);
}
//validates the URL format; returns null when the prefix check fails or the URL cannot be parsed
private URL verifyUrl(String url) {
// only handle HTTP URLs
if (!urltoLowerCase()startsWith(//))
return null;
URL verifiedUrl = null;
try {
verifiedUrl = new URL(url);
} catch (Exception e) {
return null;
}
return verifiedUrl;
}
// checks whether robots are allowed to access the given URL
// NOTE(review): urlToCheck is dereferenced without a null check, but ParsePage.run() passes
// the result of verifyUrl(), which can be null.
private boolean isRobotAllowed(URL urlToCheck) {
String host = urlToCheckgetHost()toLowerCase();//host of the given URL
//Systemoutprintln(主機=+host);
// look up the cached disallow list for this host
ArrayList< String> disallowList =disallowListCacheget(host);
// if not cached yet, download and cache it
if (disallowList == null) {
disallowList = new ArrayList< String>();
try {
URL robotsFileUrl =new URL(// + host + /robotstxt);
// NOTE(review): this reader is never closed.
BufferedReader reader =new BufferedReader(new InputStreamReader(robotsFileUrlopenStream()));
// read the robots file and build the list of disallowed paths
String line;
while ((line = readerreadLine()) != null) {
if (lineindexOf(Disallow:) == ) {//does the line contain "Disallow:"
String disallowPath =linesubstring(Disallow:length());//the disallowed path
// check for a trailing comment
int commentIndex = disallowPathindexOf(#);
if (commentIndex != ) {
disallowPath =disallowPathsubstring( commentIndex);//strip the comment
}
disallowPath = disallowPathtrim();
disallowListadd(disallowPath);
}
}
// cache the disallowed paths for this host
disallowListCacheput(host disallowList);
} catch (Exception e) {
return true; //any failure to fetch/parse robots.txt is treated as "allowed"
}
}
String file = urlToCheckgetFile();
//Systemoutprintln(文件getFile()=+file);
// disallowed when the URL's file part starts with any cached disallow path
for (int i = ; i < disallowListsize(); i++) {
String disallow = disallowListget(i);
if (filestartsWith(disallow)) {
return false;
}
}
return true;
}
// Downloads a page and returns its text with the lines concatenated (no line separators
// are re-inserted); returns null when any exception occurs.
// NOTE(review): the reader is never closed, and the stream is decoded with the platform charset.
private String downloadPage(URL pageUrl) {
try {
// Open connection to URL for reading
BufferedReader reader =
new BufferedReader(new InputStreamReader(pageUrlopenStream()));
// Read page into buffer
String line;
StringBuffer pageBuffer = new StringBuffer();
while ((line = readerreadLine()) != null) {
pageBufferappend(line);
}
return pageBuffertoString();
} catch (Exception e) {
eprintStackTrace();
}
return null;
}
// removes "www" from the URL
private String removeWwwFromUrl(String url) {
int index = urlindexOf(://www);
if (index != ) {
return urlsubstring( index + ) +
urlsubstring(index + );
}
return (url);
}
// parses the page and collects the anchor links found in it
// Links that the URL filter reports as already seen are skipped; new links are added to filterUrl.
// NOTE(review): URL.getPort() returns -1 when the URL has no explicit port, so the
// host+":"+port concatenations below can presumably produce ":-1" - confirm.
private ArrayList< String> retrieveLinks(URL pageUrl String pageContents
boolean limitHost)
{
// compile the link-matching regular expression
Pattern p =pile(<a\\s+href\\s*=\\s*\?(*?)[\|>]PatternCASE_INSENSITIVE);
Matcher m = pmatcher(pageContents);
ArrayList< String> linkList = new ArrayList< String>();
while (mfind()) {
String link = mgroup()trim();
if (linklength() < ) {
continue;
}
// skip links that point into the current page
if (linkcharAt() == #) {
continue;
}
if (linkindexOf(mailto:) != ) {
continue;
}
if (linktoLowerCase()indexOf(javascript) != ) {
continue;
}
if (linkindexOf(://) == ){
if (linkcharAt() == /) {//handle host-absolute paths
link = // + pageUrlgetHost()+:+pageUrlgetPort()+ link;
} else {
String file = pageUrlgetFile();
if (fileindexOf(/) == ) {//handle relative paths
link = // + pageUrlgetHost()+:+pageUrlgetPort() + / + link;
} else {
String path =filesubstring( filelastIndexOf(/) + );
link = // + pageUrlgetHost() +:+pageUrlgetPort()+ path + link;
}
}
}
// drop the fragment part
int index = linkindexOf(#);
if (index != ) {
link = linksubstring( index);
}
link = removeWwwFromUrl(link);
URL verifiedLink = verifyUrl(link);
if (verifiedLink == null) {
continue;
}
/* when limited to one host, drop URLs on other hosts */
if (limitHost &&
!pageUrlgetHost()toLowerCase()equals(
verifiedLinkgetHost()toLowerCase()))
{
continue;
}
// skip links that were already handled
if(ntains(link))
{
logEvent(匹配了:+link);
continue;
}
else
{
filterUrladd(link);
}
linkListadd(link);
}
return (linkList);
}
// parses the page and collects the image links found in it
// Mirrors retrieveLinks but matches img/src, skips links ending in "gif", and records
// new links in filterImg.
private ArrayList< String> retrieveImgLinks(URL pageUrl String pageContents
boolean limitHost)
{
// compile the link-matching regular expression
Pattern p =pile(<img\\s+src\\s*=\\s*\?(*?)[\|>]PatternCASE_INSENSITIVE);
Matcher m = pmatcher(pageContents);
ArrayList< String> linkList = new ArrayList< String>();
while (mfind()) {
String link = mgroup()trim();
if (linklength() < ) {
continue;
}
// skip links that point into the current page
if (linkcharAt() == #) {
continue;
}
if (linkindexOf(mailto:) != ) {
continue;
}
if (linktoLowerCase()indexOf(javascript) != ) {
continue;
}
if (linktoLowerCase()endsWith(gif)) {
continue;
}
if (linkindexOf(://) == )
{
if (linkcharAt() == /)
{//handle host-absolute paths
link = // + pageUrlgetHost()+:+pageUrlgetPort()+ link;
}
else
{
String file = pageUrlgetFile();
if (fileindexOf(/) == ) {//handle relative paths
link = // + pageUrlgetHost()+:+pageUrlgetPort() + / + link;
} else {
String path =filesubstring( filelastIndexOf(/) + );
link = // + pageUrlgetHost() +:+pageUrlgetPort()+ path + link;
}
}
}
// drop the fragment part
int index = linkindexOf(#);
if (index != ) {
link = linksubstring( index);
}
link = removeWwwFromUrl(link);
URL verifiedLink = verifyUrl(link);
if (verifiedLink == null) {
continue;
}
/* when limited to one host, drop URLs on other hosts */
if (limitHost &&
!pageUrlgetHost()toLowerCase()equals(
verifiedLinkgetHost()toLowerCase()))
{
continue;
}
// skip links that were already handled
// if (ntains(link)) {
// continue;
// }
if(ntains(link))
{
logEvent(圖片匹配了:+link);
continue;
}
else
{
filterImgadd(link);
}
if(linklastIndexOf(gif)==)
{
linkListadd(link);
}
}
return (linkList);
}
//performs the actual crawl: takes URLs from toCrawlList one at a time and submits a
//ParsePage task for each to execPage; returns the (never populated here) result list
// NOTE(review): the limithost and caseSensitive parameters are not used in this method.
// NOTE(review): the loop ends as soon as toCrawlList is empty, even if submitted ParsePage
// tasks are still running and about to add more links.
public ArrayList< String> crawl(String startUrlboolean limithostboolean caseSensitive )
{
// remove "www" from the start URL
startUrl = removeWwwFromUrl(startUrl);
toCrawlListadd(startUrl);
int idxPageParse=;
while (toCrawlListsize()>)
{
try
{
idxPageParse++;
// Get URL at bottom of the list
String url = erator()next();
pssetIntUrl(psgetIntUrl()+);
// Remove URL from the to crawl list
toCrawlListremove(url);
int intRetryPage=;
// poll until a page-analysis permit appears to be free
while (sempPageavailablePermits()<=)
{
Systemoutprintln(暫時沒有空閒的網頁分析線程等待秒再執行);
try {
intRetryPage++;
if(intRetryPage==)
{
logEvent(分析網頁+url+超時);
// NOTE(review): release() here has no matching acquire() in this method,
// so it raises the semaphore's permit count.
sempPagerelease();
break;
}
Threadsleep();
} catch (InterruptedException e) {
eprintStackTrace();
}
}
ParsePage tempPageThread=new ParsePage(url);
execPagesubmit(tempPageThread);
logEvent(開啟網頁分析線程+idxPageParse);
if(idxPageParse==)
{
ThreadcurrentThread()sleep();
}
}catch(Exception e)
{
eprintStackTrace();
}
}
blnFlag=false;
logEvent(抓圖完成);
return result;
}
/** Prints a timestamped log line to standard output. */
public static void logEvent(String strLog)
{
Systemoutprintln( new SimpleDateFormat(yyyy年MM月dd日HH時mm分ss秒)format(new Date(CalendargetInstance()getTimeInMillis()))+=====>+strLog);
}
// main entry point: reads the command-line arguments, prepares the output directory,
// the two executors and the two filters (loaded from disk when present), opens the
// result log, then runs the crawler on its own thread and waits for it.
// NOTE(review): the args[] indexes are missing in this copy, and the usage message does
// not mention the directory/file arguments the body reads.
// NOTE(review): neither executor is shut down in this method.
public static void main(String[] args) {
if(argslength!=)
{
Systemoutprintln(Usage:java SearchCrawler startUrl maxUrl searchString);
return;
}
@SuppressWarnings(unused)
String strLogPath=args[];
SearchCrawler crawler = new SearchCrawler(args[]);
outdir=args[]+/pic+new SimpleDateFormat(yyyyMMdd)format(new Date(CalendargetInstance()getTimeInMillis()))+/;
File f=new File(outdir);
if(!fexists())
{
fmkdir();
}
execPage = ExecutorsnewFixedThreadPool();
execImg = ExecutorsnewFixedThreadPool();
seroutdir=args[];
seroutdirimg=args[];
ps=new PoCalSearch();
pd=new PoDownload();
try {
// each filter file is deserialized twice: once for the null check, once for the assignment
if(UtilSerizreadObject(seroutdir)!=null)
{
Systemoutprintln(new SimpleDateFormat(yyyy年MM月dd日HH時mm分ss秒)format(new Date(CalendargetInstance()getTimeInMillis()))+=====>+反序列化URL);
filterUrl=(SimpleBloomFilter)UtilSerizreadObject(seroutdir);
}
else
{
filterUrl=new SimpleBloomFilter();
}
// NOTE(review): this checks seroutdir (the URL filter file) but then loads seroutdirimg;
// if the image file is missing, filterImg becomes null. Looks like a copy-paste slip.
if(UtilSerizreadObject(seroutdir)!=null)
{
Systemoutprintln(new SimpleDateFormat(yyyy年MM月dd日HH時mm分ss秒)format(new Date(CalendargetInstance()getTimeInMillis()))+=====>+反序列化圖片);
filterImg=(SimpleBloomFilter)UtilSerizreadObject(seroutdirimg);
}
else
{
filterImg=new SimpleBloomFilter();
}
} catch (Exception e) {
// NOTE(review): after an exception here the filters may still be null.
eprintStackTrace();
}
String strPic=args[]+/pic+new SimpleDateFormat(yyyyMMdd)format(new Date(CalendargetInstance()getTimeInMillis()))+log;
try {
bw=new BufferedWriter(new FileWriter(strPicfalse));
} catch (IOException e) {
// TODO Autogenerated catch block
eprintStackTrace();
}
Thread search=new Thread(crawler);
Systemoutprintln( new SimpleDateFormat(yyyy年MM月dd日HH時mm分ss秒)format(new Date(CalendargetInstance()getTimeInMillis()))+=====>+開始爬圖);
Systemoutprintln(下載了圖:);
searchstart();
try {
searchjoin();
logEvent(主函數結束);
bwclose();
} catch (Exception e) {
// TODO Autogenerated catch block
eprintStackTrace();
}
}
/**
* Description: task that downloads one image.
* run() and call() contain nearly the same logic; only run() writes to the result log.
* @author binbin
*
*/
public class ImgDownThread implements RunnableCallable<Long>{
//URL to download
private String stru;
private boolean isStart=true;
public ImgDownThread(String strurl) {
super();
thisstru = strurl;
}
// Downloads the image while holding a sempImg permit: images failing the size check, or
// whose saved file is not larger than a threshold, are discarded; otherwise the file name
// and URL are appended to the shared result log.
// NOTE(review): the URL is opened twice - once for `in`, once for ImageIO.read - so the
// image is presumably fetched twice; the second stream is never closed here.
// NOTE(review): `in` and `file` are not closed when an exception is thrown mid-copy.
@Override
public void run()
{
try
{
sempImgacquire();
try{
URL url=new URL(stru);
BufferedInputStream in = new BufferedInputStream(urlopenStream());
BufferedImage bi=ImageIOread(urlopenStream());
//size requirement
if (bi==null|| bigetWidth()< || bigetHeight()< )
{
inclose();
return;
}
String ss=new SimpleDateFormat(yyyyMMddHHmmss)format(new Date(CalendargetInstance()getTimeInMillis()))+_+Mathround(Mathrandom()*L+)+strusubstring(strulastIndexOf());
String s=outdir+ss;
FileOutputStream file = new FileOutputStream(new File(s));
int t;
// byte-at-a-time copy from the buffered input to the output file
while ((t = inread()) != )
{
filewrite(t);
}
fileclose();
if(new File(s)length()<=*)
{
inclose();
new File(s)delete();
return;
}
// bw is shared by all download tasks, so writes are serialized on it
synchronized(bw)
{
String str=ss+:+stru;
bwwrite(str);
bwnewLine();
bwflush();
}
logEvent(下載了:+stru);
pssetIntImg(psgetIntImg()+);
inclose();
}catch(Exception e){
// every failure is logged with the same message, whatever its cause
logEvent(**********************下載圖片:+stru+超時);
}
}
catch (Exception e)
{
eprintStackTrace();
}
finally{
// NOTE(review): also runs when acquire() itself threw, releasing a permit never taken.
sempImgrelease();
}
}
public boolean isStart() {
return isStart;
}
public void setStart(boolean isStart) {
thisisStart = isStart;
}
// Callable variant of run(); same download logic without the result-log write.
// NOTE(review): the `return` inside the finally block discards any exception thrown in
// the try, so this method never propagates a failure despite `throws Exception`.
@Override
public Long call() throws Exception {
try
{
sempImgacquire();
try{
URL url=new URL(stru);
BufferedInputStream in = new BufferedInputStream(urlopenStream());
BufferedImage bi=ImageIOread(urlopenStream());
//size requirement
if (bi==null|| bigetWidth()< || bigetHeight()< )
{
inclose();
return l;
}
String ss=new SimpleDateFormat(yyyyMMddHHmmss)format(new Date(CalendargetInstance()getTimeInMillis()))+_+Mathround(Mathrandom()*L+)+strusubstring(strulastIndexOf());
String s=outdir+ss;
FileOutputStream file = new FileOutputStream(new File(s));
int t;
while ((t = inread()) != )
{
filewrite(t);
}
fileclose();
if(new File(s)length()<=*)
{
inclose();
new File(s)delete();
return l;
}
logEvent(下載了:+stru);
pssetIntImg(psgetIntImg()+);
inclose();
}catch(Exception e){
logEvent(**********************下載圖片:+stru+超時);
}
}
catch (Exception e)
{
eprintStackTrace();
}
finally{
sempImgrelease();
return l;
}
}
}
/***
* Periodically serializes the two filters (seen URLs and seen images) to disk and logs
* the counters held in ps, for as long as blnFlag is true.
* @author binbin
*
*/
public class TimeWriteFile implements Runnable
{
@Override
public void run()
{
while(blnFlag)
{
try
{
// NOTE(review): this locks ps, but the code that adds to the filters does not
// lock ps, so in this class the lock does not guard the filters.
synchronized(ps)
{
logEvent(開始序列化URL);
UtilSerizwriteObject(filterUrlseroutdir);
logEvent(結束序列化URL);
logEvent(開始序列化圖片);
UtilSerizwriteObject(filterImgseroutdirimg);
logEvent(結束序列化圖片);
logEvent(分析了+psgetIntUrl()+個鏈接);
logEvent(下載了+psgetIntImg()+張圖片);
}
Threadsleep();
}
catch (Exception e)
{
eprintStackTrace();
}
}
}
}
/***
* Task that analyses the page at one URL: downloads it, extracts page links and image
* links, queues the page links and submits one image-download task per image link.
* NOTE(review): extends Thread but is handed to execPage.submit(), so it runs as a
* Runnable on a pool thread rather than as its own thread.
* @author Administrator
*
*/
class ParsePage extends Thread
{
String url;
int iCount=;
public int getiCount() {
return iCount;
}
public void setiCount(int iCount) {
thisiCount = iCount;
}
public String getUrl()
{
return url;
}
public void setUrl(String url)
{
thisurl = url;
}
public ParsePage(String url) {
thisurl=url;
}
@Override
public void run()
{
try
{
sempPageacquire();
// Convert string url to URL object
// NOTE(review): verifyUrl may return null; the result is used below without a check.
URL verifiedUrl = verifyUrl(url);
// Skip URL if robots are not allowed to access it
// NOTE(review): Thread.stop() is deprecated; here it would stop the current (pool)
// thread instead of simply returning from the task.
if (!isRobotAllowed(verifiedUrl))
{
ThreadcurrentThread()stop();
}
// add the processed URL to crawledList (no such list exists in this class)
String pageContents=;
pageContents = downloadPage(verifiedUrl);
logEvent(分析了:+verifiedUrl);
logEvent(待分析URL數:+toCrawlListsize()+個);
if (pageContents != null && pageContentslength() > )
{
// extract the valid page links from the page
ArrayList< String> links =retrieveLinks(verifiedUrl pageContentslimitHost);
// extract the valid image links from the page
ArrayList< String> imglinks =retrieveImgLinks(verifiedUrl pageContentslimitHost);
//queue the page links unless the pending list is already over its limit
if(toCrawlListsize()<)
{
toCrawlListaddAll(links);
}
else
{
logEvent(待分析的網頁URL超過!!!!跳過);
}
// submit one download task per image link
for(int i=;i<imglinkssize();i++)
{
if(imglinksget(i)indexOf()!=)
{
iCount++;
// the link was already added to filterImg inside retrieveImgLinks
filterImgadd(imglinksget(i));
// NOTE(review): the image counter is incremented here and again in
// ImgDownThread after a successful download.
pssetIntImg(psgetIntImg()+);
int intRetryImg=;
// poll until an image-download permit appears to be free
while (sempImgavailablePermits() <= )
{
Systemoutprintln(暫時沒有空閒的抓圖線程等待秒再執行);
try {
intRetryImg++;
if(intRetryImg==)
{
logEvent(抓圖+imglinksget(i)+超時);
// NOTE(review): release() without a matching acquire() in this
// method; it raises the permit count so the wait loop can end.
sempImgrelease();
}
Threadsleep();
} catch (InterruptedException e) {
eprintStackTrace();
}
}
// NOTE(review): the task is wrapped in a Thread that is never started; the
// executor runs it as a Runnable.
Thread tempImgThread=new Thread(new ImgDownThread(imglinksget(i)));
execImgsubmit(tempImgThread);
// pause after a batch of images (batch size not shown in this copy)
if((iCount!=) && (iCount%==) )
{
try
{
logEvent(圖多休息秒);
ThreadcurrentThread()sleep();
}
catch (InterruptedException e)
{
eprintStackTrace();
}
}
}
}
}
synchronized(arrPar)
{
arrParremove(this);
}
}
catch(Exception e)
{
eprintStackTrace();
}
finally
{
// NOTE(review): also runs when acquire() itself threw, releasing a permit never taken.
sempPagerelease();
}
}
}
}
From:http://tw.wingwit.com/Article/program/Java/hx/201311/25795.html