背景
隨著Internet的普及
網絡信息正以極高的速度增長
在這麼多數據中找到自己需要的信息是一件很繁瑣的事情
找到需要的信息後如何獲取也是件麻煩的事
這就需要Internet信息抓取程序來代替人工的操作
所謂Internet信息抓取程序
就是程序會按照用戶的關鍵詞或關鍵網站來收集相應的信息
並提供給用戶想要的信息格式
信息量的增加會帶來信息網站發布人員工作量的劇增
為使信息發布系統實現信息自動發布
減少工作人員工作量
即時跟蹤最新信息
就需要自動信息提供程序
因此Internet信息抓取程序應運而生
目標
實現自定義網站信息分類抓取
存入本地數據庫
生成靜態頁面或其它用戶定義的信息結構
並下載與信息相關的多媒體文件
開發
目標站點結構分析
本步驟是準確抓取信息的關鍵
首先要選擇更新頻率高的頁面作為抓取地址
然後分析要抓取內容頁面url特點
然後分析要抓取信息頁面的元素特性
比如標題位置
內容位置 等
得到定位標記點
將以上信息寫成自己的配置文件或存到數據庫中
每個網站都需要分析
寫出單獨的配置文件
供抓取程序使用
信息提取
根據配置文件取得要抓取頁面url
使用HttpWebRequest類獲取內容
/// <summary>
/// Downloads the page at <paramref name="a_strUrl"/> and returns its text.
/// </summary>
/// <param name="a_strUrl">URL to request.</param>
/// <param name="timeout">Value assigned to <c>HttpWebRequest.Timeout</c> (milliseconds).</param>
/// <returns>
/// The page text, decoded with <c>Encoding.Default</c>, each line terminated by "\r\n".
/// If any exception is thrown, the exception message prefixed with "錯誤：" is returned instead.
/// </returns>
public string Get_Http(string a_strUrl, int timeout)
{
    string strResult;
    try
    {
        HttpWebRequest myReq = (HttpWebRequest)HttpWebRequest.Create(a_strUrl);
        myReq.Timeout = timeout;

        // Dispose response, stream and reader even when reading fails, so the
        // underlying connection is returned to the pool.
        using (HttpWebResponse HttpWResp = (HttpWebResponse)myReq.GetResponse())
        using (Stream myStream = HttpWResp.GetResponseStream())
        using (StreamReader sr = new StreamReader(myStream, Encoding.Default))
        {
            StringBuilder strBuilder = new StringBuilder();

            // Loop on ReadLine() == null instead of Peek() == -1: Peek can report -1
            // on a network stream while more data is still on its way, which would
            // silently truncate the page.
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                strBuilder.Append(line);
                strBuilder.Append("\r\n");
            }
            strResult = strBuilder.ToString();
        }
    }
    catch (Exception exp)
    {
        // NOTE(review): the punctuation after "錯誤" was lost in the published
        // listing; the full-width colon is a reconstruction — confirm against callers.
        strResult = "錯誤：" + exp.Message;
    }
    return strResult;
}
獲取頁面內容後
分析頁面中連接地址取到要抓取的url
// Process the titles and links of a listing page.
//
// Downloads urlStr through PublicFun.Get_Http, keeps the text found between the
// markers blockB and blockE, rewrites the <A, </A, Href= and HREF= spellings it
// finds, then walks the remaining text link by link. For every link it appends
// RemoveHtmlCode( reTitle ) + sTotalOne to reUTStr, and reUTStr is the return value.
// Links in which "//" is not found are prefixed with a base taken from urlStr
// (urlch... or urlChange). Any exception ends the method in the catch block.
//
// NOTE(review): this listing has lost its string literals, numeric constants,
// '.' member-access dots, ',' argument separators and the digit suffixes of
// several variable names (pos, urlch, end_n, end_nums), so it does not compile
// as shown and the exact offsets/return values cannot be read from it.
public string SniffWebUrl( string urlStr
string blockB
string blockE )
{
string urlch
=
;
string urlch
=
;
int end_n
=
;
int end_nums =
;
int end_nums
=
;
int end_nums
=
;
int end_nums
=
;
string reUTStr =
;
string reTitle =
;
string ret =
;
try
{
// Locate reference positions inside urlStr; bail out when they are missing.
int pos
= urlStr
IndexOf(
);
int pos
= urlStr
LastIndexOf(
/
);
if( pos
<
)
{
return
;
}
if( pos
<
)
{
return
;
}
int pos
= urlStr
IndexOf(
/
pos
);
if ( pos
<
)
{
urlch
= urlStr;
urlch
= urlStr;
}
else
{
urlch
= urlStr
Substring(
pos
);
urlch
= urlStr
Substring(
pos
);
}
// Download the listing page and find the blockB ... blockE region.
string tmpAllStr = new PublicFun()
Get_Http( urlStr
time
);
int pos
= tmpAllStr
IndexOf( blockB );
int pos
= tmpAllStr
IndexOf( blockE
pos
+ blockB
Length );
if ( pos
>
&& pos
>
&& pos
>pos
)
{
ret = tmpAllStr
Substring( pos
+ blockB
Length
pos
pos
blockB
Length );
ret = ret
Substring( ret
IndexOf(
<
));
// Rewrite every "<A" occurrence as "<a".
while( ret
IndexOf(
<A
) >=
)
{
ret = ret
Substring(
ret
IndexOf(
<A
) ) +
<a
+ ret
Substring( ret
IndexOf(
<A
) +
);
}
// Rewrite every "</A" occurrence as "</a".
while( ret
IndexOf(
</A
) >=
)
{
ret = ret
Substring(
ret
IndexOf(
</A
) ) +
</a
+ ret
Substring( ret
IndexOf(
</A
) +
);
}
// Replace every "Href=" occurrence (replacement literal lost in this listing).
while( ret
IndexOf(
Href=
) >=
)
{
ret = ret
Substring(
ret
IndexOf(
Href=
)) +
+ ret
Substring( ret
IndexOf(
Href=
) +
);
}
// Replace every "HREF=" occurrence (replacement literal lost in this listing).
while( ret
IndexOf(
HREF=
) >=
)
{
ret = ret
Substring(
ret
IndexOf(
HREF=
)) +
+ ret
Substring( ret
IndexOf(
HREF=
) +
);
}
while( ret
IndexOf(
) >=
)
{
ret = ret
Substring(
ret
IndexOf(
)) +
href=\
+ ret
Substring( ret
IndexOf(
) +
);
}
}
// From here on tmpAllStr holds the extracted region and is consumed link by link.
tmpAllStr = ret;
int begin_nums = tmpAllStr
IndexOf(
);
while ( begin_nums >=
)
{
string tmpStrA =
;
string tmpStrB = tmpAllStr
Substring( begin_nums +
);
if ( tmpStrB ==
\
)
{
end_n
= begin_nums +
;
if ( ( end_n
+
) > tmpAllStr
Length )
{
return
;
}
tmpStrA = tmpAllStr
Substring( begin_nums+
);
}
else
{
end_n
= begin_nums +
;
tmpStrA = tmpStrB;
}
// A link value of "#" is skipped: advance past it and look for the next one.
if ( tmpStrA ==
#
)
{
tmpAllStr = tmpAllStr
Substring( end_n
);
begin_nums = tmpAllStr
IndexOf(
);
}
else
{
end_nums
= tmpAllStr
IndexOf(
end_n
);
end_nums
= tmpAllStr
IndexOf(
>
end_n
);
end_nums
= tmpAllStr
IndexOf(
</a
end_nums
);
if ( ( end_nums
>=
) && ( end_nums
>=
) )
{
// Text of the link, later passed through RemoveHtmlCode.
reTitle = tmpAllStr
Substring( end_nums
+
end_nums
end_nums
);
if ( end_nums
> end_nums
)
{
end_nums = end_nums
;
}
else
{
if ( end_nums
<
)
{
end_nums = end_nums
;
}
else
{
end_nums = end_nums
;
}
}
string str
= tmpAllStr
Substring( end_nums
end_nums
end_nums +
);
if ( str
==
\
|| str
==
)
{
end_nums = end_nums
;
}
// sTotalOne is the link target cut out of the tag.
string sTotalOne = tmpAllStr
Substring( end_n
end_nums
end_n
);
if ( sTotalOne
IndexOf( //
) <
)
{
if ( sTotalOne
IndexOf(
/
) ==
)
{
sTotalOne = urlch
+ sTotalOne;
}
else
{
int linshiIntNum =
;
int flags =
;
string urlChange = urlStr;;
// Consume one leading path segment of the link per iteration, counting them.
while( sTotalOne
IndexOf(
/
) >=
)
{
sTotalOne = sTotalOne
Substring( sTotalOne
IndexOf(
/
) +
);
linshiIntNum = linshiIntNum +
;
flags = flags +
;
}
// Cut urlChange back at its last "/" while the counter allows.
while( ( urlChange
LastIndexOf(
/
) >=
) && ( linshiIntNum >=
) )
{
urlChange = urlChange
Substring(
urlChange
LastIndexOf(
/
) );
linshiIntNum = linshiIntNum
;
}
if ( flags ==
)
{
sTotalOne = urlch
+
/
+ sTotalOne;
}
else
{
sTotalOne = urlChange +
/
+ sTotalOne;
}
}
}
// Accumulate cleaned title + resolved link, then continue after this link.
reUTStr = reUTStr + new PublicFun()
RemoveHtmlCode( reTitle ) + sTotalOne;
tmpAllStr = tmpAllStr
Substring( end_nums
+
);
begin_nums = tmpAllStr
IndexOf(
);
}
else
{
begin_nums =
;
}
}
}
return reUTStr;
}
catch( Exception e)
{
return
;
}
}
得到要抓取內容的url後
處理該頁面
// Fetch the content behind a link and process it by category.
//
// Downloads subUrl through PublicFun.Get_Http, cuts out the text between
// b_Content and e_Content, removes the span between b_Filter and e_Filter,
// rewrites Src=/SRC= attributes, collects image addresses whose lower-cased
// form contains jpg, gif or bmp into rePicStr (joined with "||"), replaces
// <P>/<br> spellings with placeholder markers, strips the remaining tags,
// restores the placeholders as <br>\n and <img src=...>, removes subTitle from
// the text, and finally calls PublicFun.Get_Img for every collected image with
// a target path built from root + \\images\\. Returns reContentStr.
//
// NOTE(review): this listing has lost its string literals, numeric constants,
// '.' member-access dots, ',' argument separators and the digit suffixes of
// several variable names (pos, urlch, end_n, end_nums, linshiInt, tmpStr), so
// it does not compile as shown and exact offsets cannot be read from it.
public string GetWebContent( string gatherUrl
string subUrl
string subTitle
string b_Content
string e_Content
string b_Filter
string e_Filter
string root )
{
string tmpAllStr =
;
string dfStrB =
;
string dfStrE =
;
string rePicStr =
;// returned image paths
string reContentStr =
;
string picHtml =
images
; // local image path
string urlch
=
;
string urlch
=
;
int pos
= gatherUrl
IndexOf(
);
int pos
= gatherUrl
LastIndexOf(
/
);
if( pos
<
)
{
return
;
}
if( pos
<
)
{
return
;
}
int pos
= gatherUrl
IndexOf(
/
pos
);
if ( pos
<
)
{
urlch
= gatherUrl;
urlch
= gatherUrl;
}
else
{
urlch
= gatherUrl
Substring(
pos
);
urlch
= gatherUrl
Substring(
pos
);
}
tmpAllStr = new PublicFun()
Get_Http( subUrl
time
);
// Extract the article source
string docFromStr =
;
if ( dfStrB !=
&& dfStrE !=
)
{
if ( tmpAllStr !=
)
{
int b_docF = tmpAllStr
IndexOf( dfStrB );
if ( b_docF >
)
{
int e_docF = tmpAllStr
IndexOf( dfStrE
b_docF + dfStrB
Length );
if ( e_docF >
&& e_docF > b_docF && e_docF
b_docF <
)
{
docFromStr = tmpAllStr
Substring( b_docF + dfStrB
Length
e_docF
b_docF
dfStrB
Length );
}
}
}
}
// Extract the content
if ( tmpAllStr !=
)
{
int begin_strnum = tmpAllStr
IndexOf( b_Content );
if ( begin_strnum <
)
{
return
;
}
int end_strnum = tmpAllStr
IndexOf( e_Content
begin_strnum + b_Content
Length );
if ( end_strnum <
)
{
return
;
}
string sTotalSubM =
;
if ( end_strnum > begin_strnum )
{
sTotalSubM = tmpAllStr
Substring ( begin_strnum
end_strnum
begin_strnum );
}
if ( sTotalSubM ==
)
{
return
;
}
// Filter out unwanted information
int bfnum = sTotalSubM
IndexOf( b_Filter );
if ( bfnum >
)
{
int efnum = sTotalSubM
IndexOf( e_Filter
bfnum );
if ( efnum >
)
{
if ( efnum > bfnum )
{
sTotalSubM = sTotalSubM
Substring(
bfnum ) + sTotalSubM
Substring( efnum + e_Filter
Length );
}
}
}
// Normalize image tags
while( sTotalSubM
IndexOf(
Src=
) >=
)
{
sTotalSubM = sTotalSubM
Substring(
sTotalSubM
IndexOf(
Src=
) ) +
src=
+ sTotalSubM
Substring( sTotalSubM
IndexOf(
Src=
) +
);
}
while( sTotalSubM
IndexOf(
SRC=
) >=
)
{
sTotalSubM = sTotalSubM
Substring(
sTotalSubM
IndexOf(
SRC=
) ) +
src=
+ sTotalSubM
Substring( sTotalSubM
IndexOf(
SRC=
) +
);
}
while( sTotalSubM
IndexOf(
src=
) >=
)
{
sTotalSubM = sTotalSubM
Substring(
sTotalSubM
IndexOf(
src=
) ) +
src=\
+ sTotalSubM
Substring( sTotalSubM
IndexOf(
src=
) +
);
}
// Extract image addresses
int end_n
=
;
int end_nums
=
;
int begin_nums
= sTotalSubM
IndexOf(
src=
);
while( begin_nums
>=
)
{
String tmpStr = sTotalSubM
Substring( begin_nums
+
);
if ( tmpStr ==
\
)
{
end_n
= begin_nums
+
;
}
else
{
end_n
= begin_nums
+
;
}
int end_nums
a = sTotalSubM
IndexOf(
end_n
);
int end_nums
b = sTotalSubM
IndexOf(
>
end_n
);
if ( end_nums
b <
)
{
break;
}
if ( end_nums
a > end_nums
b )
{
end_nums
= end_nums
b;
}
else
{
if (end_nums
a<
)
{
end_nums
= end_nums
b;
}
else
{
end_nums
= end_nums
a;
}
}
tmpStr = sTotalSubM
Substring( end_nums
);
if ( tmpStr ==
\
|| tmpStr ==
)
{
end_nums
= end_nums
;
}
string tmpPicStr = sTotalSubM
Substring( end_n
end_nums
end_n
);
if ( tmpPicStr
IndexOf( //
) <
)
{
if ( tmpPicStr
IndexOf(
/
) ==
)
{
tmpPicStr = urlch
+ tmpPicStr;
}
else
{
int linshiIntNum =
;
int flags =
;
string urlChange = subUrl;
while( tmpPicStr
IndexOf(
/
) >=
)
{
tmpPicStr = tmpPicStr
Substring( tmpPicStr
IndexOf(
/
) +
);
linshiIntNum = linshiIntNum +
;
flags = flags +
;
}
while( ( urlChange
LastIndexOf(
/
) >=
) && ( linshiIntNum >=
) )
{
urlChange = urlChange
Substring(
urlChange
LastIndexOf(
/
) );
linshiIntNum = linshiIntNum
;
}
if ( flags ==
)
{
tmpPicStr = urlch
+
/
+ tmpPicStr;
}
else
{
tmpPicStr = urlChange +
/
+ tmpPicStr;
}
}
}
//tmpPicStr = tmpPicStr
ToLower();
string tmpPicStrTmp = tmpPicStr
ToLower();
//if ( tmpPicStr
IndexOf(
jpg
) >
|| tmpPicStr
IndexOf(
gif
) >
|| tmpPicStr
IndexOf(
bmp
) >
)
if ( tmpPicStrTmp
IndexOf(
jpg
) >
|| tmpPicStrTmp
IndexOf(
gif
) >
|| tmpPicStrTmp
IndexOf(
bmp
) >
)
{
rePicStr = rePicStr +
||
+ tmpPicStr ;
int flagN
= tmpPicStr
LastIndexOf(
/
);
string fileN
= picHtml + tmpPicStr
Substring( flagN
);
sTotalSubM = sTotalSubM
Substring(
end_nums
) +
>******
+ fileN
+
******<
+ sTotalSubM
Substring( end_nums
);
begin_nums
= sTotalSubM
IndexOf(
src=
end_nums
+ fileN
Length +
);
}
else
{
begin_nums
= sTotalSubM
IndexOf(
src=
end_nums
+
);
}
}
if ( rePicStr
Length >
)
rePicStr = rePicStr
Substring(
);
// Content processing: normalize the key tags into placeholder markers
while( sTotalSubM
IndexOf(
<P
) >=
)
{
sTotalSubM = sTotalSubM
Substring(
sTotalSubM
IndexOf(
<P
) ) +
|****|<
+ sTotalSubM
Substring( sTotalSubM
IndexOf(
<P
) +
);
}
while( sTotalSubM
IndexOf(
<p
) >=
)
{
sTotalSubM = sTotalSubM
Substring(
sTotalSubM
IndexOf(
<p
) ) +
|****|<
+ sTotalSubM
Substring( sTotalSubM
IndexOf(
<p
) +
);
}
while( sTotalSubM
IndexOf(
</P
) >=
)
{
sTotalSubM = sTotalSubM
Substring(
sTotalSubM
IndexOf(
</P
) ) +
|****|<
+ sTotalSubM
Substring( sTotalSubM
IndexOf(
</P
) +
);
}
while( sTotalSubM
IndexOf(
</p
) >=
)
{
sTotalSubM = sTotalSubM
Substring(
sTotalSubM
IndexOf(
</p
) ) +
|****|<
+ sTotalSubM
Substring( sTotalSubM
IndexOf(
</p
) +
);
}
while( sTotalSubM
IndexOf(
<br
) >=
)
{
sTotalSubM = sTotalSubM
Substring(
sTotalSubM
IndexOf(
<br
) ) +
+****+<
+ sTotalSubM
Substring( sTotalSubM
IndexOf(
<br
) +
);
}
while( sTotalSubM
IndexOf(
<BR
) >=
)
{
sTotalSubM = sTotalSubM
Substring(
sTotalSubM
IndexOf(
<BR
) ) +
+****+<
+ sTotalSubM
Substring( sTotalSubM
IndexOf(
<BR
) +
);
}
while( sTotalSubM
IndexOf(
<Br
) >=
)
{
sTotalSubM = sTotalSubM
Substring(
sTotalSubM
IndexOf(
<Br
) ) +
+****+<
+ sTotalSubM
Substring( sTotalSubM
IndexOf(
<Br
) +
);
}
while( sTotalSubM
IndexOf(
<bR
) >=
)
{
sTotalSubM = sTotalSubM
Substring(
sTotalSubM
IndexOf(
<bR
) ) +
+****+<
+ sTotalSubM
Substring( sTotalSubM
IndexOf(
<bR
) +
);
}
// Remove HTML tags
int linshiInt
= sTotalSubM
IndexOf(
<
);
int linshiInt
= sTotalSubM
IndexOf(
>
);
if ( linshiInt
< linshiInt
)
{
sTotalSubM = sTotalSubM
Substring( linshiInt
+
);
}
int linshiInt
= sTotalSubM
LastIndexOf(
<
);
int linshiInt
= sTotalSubM
LastIndexOf(
>
);
if ( linshiInt
< linshiInt
)
{
sTotalSubM = sTotalSubM
Substring(
linshiInt
+
);
}
linshiInt
= sTotalSubM
IndexOf(
<
);
while ( linshiInt
>=
)
{
linshiInt
= sTotalSubM
IndexOf(
>
linshiInt
);
if ( linshiInt
>=
)
{
sTotalSubM = sTotalSubM
Substring(
linshiInt
) + sTotalSubM
Substring( linshiInt
+
);
}
else
{
sTotalSubM = sTotalSubM
Substring(
linshiInt
);
}
linshiInt
= sTotalSubM
IndexOf(
<
);
}
// Restore the key tags from their placeholder markers
int linshiInt
=
;
int linshiInt
=
;
while( sTotalSubM
IndexOf(
+****+
) >=
)
{
sTotalSubM = sTotalSubM
Substring(
sTotalSubM
IndexOf(
+****+
) ) +
<br>\n
+ sTotalSubM
Substring( sTotalSubM
IndexOf(
+****+
) +
);
}
while( sTotalSubM
IndexOf(
|****|
) >=
)
{
sTotalSubM = sTotalSubM
Substring(
sTotalSubM
IndexOf(
|****|
) ) +
<br>\n
+ sTotalSubM
Substring( sTotalSubM
IndexOf(
|****|
) +
);
}
while( sTotalSubM
IndexOf(
******
) >=
)
{
linshiInt
= sTotalSubM
IndexOf(
******
) +
;
linshiInt
= sTotalSubM
IndexOf(
******
linshiInt
);
if ( linshiInt
>=
)
{
int tmpPos = sTotalSubM
IndexOf(
******
);
string tmpStr
= sTotalSubM
Substring(
tmpPos );
string tmpStr
= sTotalSubM
Substring( linshiInt
linshiInt
linshiInt
);
string tmpStr
= sTotalSubM
Substring( linshiInt
+
);
sTotalSubM = tmpStr
+
<img src=
+ tmpStr
+
>
+ tmpStr
;
}
else
{
break;
}
}
// Remove the title from the content
if ( sTotalSubM
IndexOf( subTitle ) >=
)
{
sTotalSubM = sTotalSubM
Substring(
sTotalSubM
IndexOf( subTitle ) ) + sTotalSubM
Substring( sTotalSubM
IndexOf( subTitle ) + subTitle
Length );
}
reContentStr = sTotalSubM;
// Invoke the image download feature
// Download the images into the target directory
string[] img_Url = new PublicFun()
split( rePicStr
||
);
for ( int i=
;i<img_Url
Length;i++ )
{
if ( img_Url[i] !=
)
{
new PublicFun()
Get_Img( img_Url[i]
root +
\\images\\
+ img_Url[i]
Substring( img_Url[i]
LastIndexOf(
/
)+
) );
}
}
}
return reContentStr;
}
以上方法返回要取得的信息
包括標題內容
圖片地址等
下載頁面中圖片
/// <summary>
/// Downloads the image at <paramref name="a_strUrl"/> and saves it to
/// <paramref name="filepath"/>, creating the target directory when needed.
/// </summary>
/// <param name="a_strUrl">URL of the image to request.</param>
/// <param name="timeout">Value assigned to <c>HttpWebRequest.Timeout</c> (milliseconds).</param>
/// <param name="filepath">
/// Destination file. The part before the last backslash is treated as the directory;
/// failures are logged to "error.log" in that directory.
/// </param>
/// <remarks>
/// Exceptions raised while downloading or saving are not rethrown; they are
/// passed to <c>WriteLog</c> together with the URL.
/// </remarks>
public void Get_Img(string a_strUrl, int timeout, string filepath)
{
    // Work out the directory once, guarded against a path without a backslash.
    // The original recomputed Substring(0, LastIndexOf("\\")) inside the catch
    // handler, so such a path made the handler itself throw.
    int lastSlash = filepath.LastIndexOf("\\");
    string path = lastSlash > 0 ? filepath.Substring(0, lastSlash) : "";

    try
    {
        HttpWebRequest myReq = (HttpWebRequest)HttpWebRequest.Create(a_strUrl);
        myReq.Timeout = timeout;

        // Dispose response, stream and bitmap deterministically. The bitmap is
        // disposed first, while the stream it was built from is still open.
        using (HttpWebResponse HttpWResp = (HttpWebResponse)myReq.GetResponse())
        using (Stream myStream = HttpWResp.GetResponseStream())
        using (Bitmap map = new Bitmap(myStream))
        {
            if (path.Length > 0 && !Directory.Exists(path))
            {
                CreateDir(path);
            }
            // Saving the bitmap directly is equivalent to the former detour through
            // a PictureBox, which only held a reference to the same image.
            map.Save(filepath);
        }
    }
    catch (Exception exp)
    {
        string logFile = path.Length > 0 ? path + "\\error.log" : "error.log";
        // NOTE(review): the separator between URL and message was lost in the
        // published listing; "--" is a reconstruction — confirm the log format.
        WriteLog(logFile, a_strUrl + "--" + exp.Message + "\r\n");
    }
}
保存文件或入庫
上面取得的信息可以按自己的要求保存
****設計的時候沒有使用url按層次循環抓取
這樣定義抓取url效率更高
速度更快
測試程序下載
如有建議請發送EMail
或msn
注
此版本只提供靜態文件存儲功能
不提供數據庫接口
不提供自定義網站功能
本程序運行需要先安裝 .NET 框架
From:http://tw.wingwit.com/Article/program/net/201311/12401.html