熱點推薦:
您现在的位置: 電腦知識網 >> 編程 >> .NET編程 >> 正文

利用C#實現web信息自動抓取

2013-11-13 10:03:10  來源: .NET編程 

背景
 
 隨著 Internet 的普及，網絡信息正以極高的速度增長。在這麼多數據中找到自己需要的信息是一件很繁瑣的事情，找到需要的信息後如何獲取也是件麻煩的事。這就需要 Internet 信息抓取程序來代替人工的操作。
所謂 Internet 信息抓取程序，就是程序會按照用戶的關鍵詞或關鍵網站來收集相應的信息，並提供給用戶想要的信息格式。
 
 信息量的增加會帶來信息網站發布人員工作量的劇增。為使信息發布系統實現信息自動發布、減少工作人員工作量、即時跟蹤最新信息，就需要自動信息提供程序，因此 Internet 信息抓取程序應運而生。
 
目標
 
 實現自定義網站信息分類抓取，存入本地數據庫、生成靜態頁面或其它用戶定義的信息結構，並下載與信息相關的多媒體文件。
 
開發
 
目標站點結構分析
本步驟是准確抓取信息的關鍵。
首先要選擇更新頻率高的頁面做為抓取地址，然後分析要抓取內容頁面 url 的特點。
然後分析要抓取信息頁面的元素特性，比如標題位置、內容位置等，得到定位標記點。
將以上信息寫成自己的配置文件或存到數據庫中。
每個網站都需要分析，寫出單獨的配置文件，供抓取程序使用。
 
信息提取
根據配置文件取得要抓取頁面的 url，使用 HttpWebRequest 類獲取內容。
//獲取http頁面函數
        public string Get_Http(string a_strUrlint timeout)
        {
            string strResult ;        
 
            try
            {
HttpWebRequest myReq = (HttpWebRequest)HttpWebRequestCreate(a_strUrl) ;
                myReqTimeout = timeout;
                HttpWebResponse HttpWResp = (HttpWebResponse)myReqGetResponse();
           
                Stream myStream = HttpWRespGetResponseStream () ;
 
                StreamReader sr = new StreamReader(myStream EncodingDefault);
                StringBuilder strBuilder = new StringBuilder();
                while ( != srPeek())
                {
                    strBuilderAppend(srReadLine()+\r\n);
                }
 
                strResult = strBuilderToString();
            }
            catch(Exception exp)
            {
                strResult = 錯誤 + expMessage ;
            }
 
            return strResult ;
 
        }
獲取頁面內容後，分析頁面中的連接地址，取到要抓取的 url。
//處理頁面標題和鏈接
        // Fetches the page at urlStr through Get_Http, cuts out the region between
        // the blockB and blockE markers, and walks the links in that region,
        // appending each link's tag-stripped title and its URL to the returned string.
        //
        // NOTE(review): this listing has lost punctuation, string literals, numeric
        // constants and variable-name suffixes (e.g. "pos", "urlch" and "end_nums"
        // are each declared several times, and comparisons read "< )"). It does not
        // compile as shown. The comments below describe the apparent intent only -
        // confirm every detail against the original source before relying on it.
        public string SniffWebUrl( string urlStrstring blockBstring blockE )
        {      
            // Apparently two base-URL prefixes derived from urlStr; they are
            // prepended to relative links further down.
            string urlch = ;
            string urlch = ;                   
            // Scratch positions used while scanning each link.
            int end_n = ;
            int end_nums = ;
            int end_nums = ;
            int end_nums = ;
            int end_nums     = ;           
            // reUTStr accumulates the result; reTitle holds the current link title;
            // ret holds the blockB..blockE region of the page.
            string reUTStr = ;
            string reTitle = ;
            string ret = ;          
            try
            {
                // Locate separators inside urlStr; the method returns early when
                // either search fails (presumably a malformed URL).
                int pos = urlStrIndexOf( );
                int pos = urlStrLastIndexOf( / );
                if( pos < )
                {
                    return ;
                }
                if( pos < )
                {
                    return ;
                }
                // Look for a "/" starting from an earlier position; when there is
                // none the whole urlStr is used as the base, otherwise prefixes of it.
                int pos = urlStrIndexOf( /pos );
                if ( pos < )
                {
                    urlch = urlStr;
                    urlch = urlStr;
                }
                else
                {
                    urlch = urlStrSubstring( pos );
                    urlch = urlStrSubstring( pos );
                }
 
                // Download the page via the shared Get_Http helper on PublicFun.
                string tmpAllStr = new PublicFun()Get_Http( urlStr time);
 
                // Find the start marker, then the end marker after it.
                int pos = tmpAllStrIndexOf( blockB );
                int pos = tmpAllStrIndexOf( blockEpos + blockBLength );
                if ( pos> && pos> && pos>pos )
                {
                    // Keep only the text after blockB and before blockE, then drop
                    // anything before the first "<".
                    ret = tmpAllStrSubstring( pos + blockBLengthpos pos blockBLength );
                    ret = retSubstring( retIndexOf( < ));
                    // Normalise "<A" to "<a" and "</A" to "</a".
                    while( retIndexOf( <A ) >= )
                    {
                        ret = retSubstring( retIndexOf( <A ) ) + <a + retSubstring( retIndexOf( <A ) + );
                    }
                    while( retIndexOf( </A ) >= )
                    {
                        ret = retSubstring( retIndexOf( </A ) ) + </a + retSubstring( retIndexOf( </A ) + );
                    }
                    // Rewrite the "Href=" / "HREF=" spellings (the replacement
                    // literal is missing here; presumably a lower-case form).
                    while( retIndexOf( Href= ) >= )
                    {
                        ret = retSubstring( retIndexOf( Href= )) + + retSubstring( retIndexOf( Href= ) + );
                    }
                    while( retIndexOf( HREF= ) >= )
                    {
                        ret = retSubstring( retIndexOf( HREF= )) + + retSubstring( retIndexOf( HREF= ) + );
                    }
                    // The searched literal is missing; the replacement starts with
                    // href=\ (presumably normalising the attribute's quote style).
                    while( retIndexOf( ) >= )
                    {
                        ret = retSubstring( retIndexOf( )) + href=\ + retSubstring( retIndexOf( ) + );
                    }
                }      
                // From here on tmpAllStr is the normalised region, consumed link by link.
                tmpAllStr = ret;     
                int begin_nums = tmpAllStrIndexOf( );
 
                while ( begin_nums >= )
                {              
                    // tmpStrB is taken just after the match; when it equals the
                    // escaped character the URL apparently starts one position later.
                    string tmpStrA = ;
                    string tmpStrB = tmpAllStrSubstring( begin_nums + );
                    if ( tmpStrB == \ )
                    {
                        end_n = begin_nums + ;
                        // Guard against reading past the end of the text.
                        if ( ( end_n + ) > tmpAllStrLength )
                        {
                            return ;
                        }
                        tmpStrA = tmpAllStrSubstring( begin_nums+ );
                    }
                    else
                    {
                        end_n = begin_nums + ;
                        tmpStrA = tmpStrB;
                    }
 
                    // Targets starting with "#" are skipped: advance past this
                    // link and search for the next one.
                    if ( tmpStrA == # )
                    {
                        tmpAllStr = tmpAllStrSubstring( end_n );
                        begin_nums = tmpAllStrIndexOf( );
                    }
                    else
                    {                  
                        // Candidate end positions: a (missing) delimiter after the
                        // URL, the tag's ">" and the closing "</a".
                        end_nums = tmpAllStrIndexOf( end_n );
                        end_nums = tmpAllStrIndexOf( >end_n );
                        end_nums = tmpAllStrIndexOf( </aend_nums );
 
                        if ( ( end_nums >= ) && ( end_nums >= ) )
                        {
                            // The link title is cut from between the found positions
                            // (presumably the text between ">" and "</a").
                            reTitle = tmpAllStrSubstring( end_nums + end_nums end_nums );
 
                            // Choose which candidate terminates the URL; a negative
                            // search result falls back to the other candidate.
                            if ( end_nums > end_nums )
                            {
                                end_nums = end_nums;
                            }
                            else
                            {
                                if ( end_nums < )
                                {
                                    end_nums = end_nums;
                                }
                                else
                                {
                                    end_nums = end_nums;
                                }
                            }
                            // Inspect a character near the end; when it is a quote
                            // the end position is adjusted (presumably moved back by one).
                            string str = tmpAllStrSubstring( end_nums end_nums end_nums + );
 
                            if ( str ==\ || str == )
                            {
                                end_nums = end_nums ;
                            }
                            // sTotalOne is the raw link target.
                            string sTotalOne = tmpAllStrSubstring( end_nend_nums end_n );
 
                            // Targets that do not contain the searched scheme text
                            // are treated as relative and prefixed with a base URL.
                            if ( sTotalOneIndexOf( // ) < )
                            {
                                // A target beginning with "/" gets a base prefix directly.
                                if ( sTotalOneIndexOf( / ) == )
                                {
                                    sTotalOne = urlch + sTotalOne;
                                }
                                else
                                {                              
                                    // Strip leading path segments from the target while
                                    // counting them (presumably "../" steps), then trim
                                    // the same number of trailing segments off urlStr.
                                    int linshiIntNum = ;
                                    int flags = ;
                                    string urlChange = urlStr;;
                                    while( sTotalOneIndexOf( / ) >= )
                                    {
                                        sTotalOne = sTotalOneSubstring( sTotalOneIndexOf( / ) + );
                                        linshiIntNum = linshiIntNum + ;
                                        flags = flags +;
                                    }
                                    while( ( urlChangeLastIndexOf( / ) >= ) && ( linshiIntNum >= ) )
                                    {
                                        urlChange = urlChangeSubstring( urlChangeLastIndexOf( / ) );
                                        linshiIntNum = linshiIntNum ;
                                    }
                                    // flags records whether any segment was stripped and
                                    // selects which base the target is joined to.
                                    if ( flags == )
                                    {
                                        sTotalOne = urlch + / + sTotalOne;
                                    }
                                    else
                                    {
                                        sTotalOne = urlChange + / + sTotalOne;
                                    }
                                }
                            }
                            // Append the title (passed through RemoveHtmlCode) and the
                            // resolved URL to the result. Any separators are missing
                            // from this listing.
                            reUTStr = reUTStr + new PublicFun()RemoveHtmlCode( reTitle ) + sTotalOne;
 
                            // Drop the processed link and look for the next one.
                            tmpAllStr = tmpAllStrSubstring( end_nums + );
                            begin_nums = tmpAllStrIndexOf( );
                        }
                        else
                        {
                            // No complete link left: begin_nums is reset (presumably
                            // to a negative value) so the loop ends.
                            begin_nums = ;
                        }                   
                    }
                }
                return reUTStr;
            }
            catch( Exception e)
            {
                // Any exception is swallowed and a fallback value is returned
                // (the literal is missing; presumably an empty string).
                return ;
            }
        }
 
得到要抓取內容的 url 後，處理該頁面。
//獲取鏈接內容並分類處理
        // Fetches subUrl through Get_Http and returns the processed article body:
        // the text between b_Content and e_Content, with the b_Filter..e_Filter
        // span removed, image references rewritten to local file names, HTML tags
        // stripped (paragraph and line breaks are kept as "<br>\n"), and subTitle
        // removed from the text. Images referenced by the content are downloaded
        // with Get_Img into an "images" folder under root.
        //
        // NOTE(review): this listing has lost punctuation, string literals, numeric
        // constants and variable-name suffixes (e.g. "pos", "urlch", "linshiInt"
        // and "tmpStr" are each declared several times). It does not compile as
        // shown. The comments below describe the apparent intent only - confirm
        // every detail against the original source before relying on it.
        public string GetWebContent( string gatherUrlstring subUrlstring subTitlestring b_Contentstring e_Contentstring b_Filterstring e_Filterstring root )
        {
            string tmpAllStr = ;           
            // Start/end markers for the article-source section below; they are
            // only initialised here and never reassigned in this method.
            string dfStrB = ;
            string dfStrE = ;               
            string rePicStr = ;//returned image paths, joined with "||"
            string reContentStr = ;
            string picHtml = images; //local image path
           
            // Apparently two base-URL prefixes derived from gatherUrl; they are
            // prepended to relative image paths further down.
            string urlch =;
            string urlch =;
            // Locate separators inside gatherUrl; the method returns early when
            // either search fails (presumably a malformed URL).
            int pos = gatherUrlIndexOf( );
            int pos = gatherUrlLastIndexOf( / );
            if( pos < )
            {
                return ;
            }
            if( pos < )
            {               
                return ;
            }
            int pos = gatherUrlIndexOf( /pos );
            if ( pos < )
            {
                urlch = gatherUrl;
                urlch = gatherUrl;
            }
            else
            {
                urlch = gatherUrlSubstring( pos );
                urlch = gatherUrlSubstring( pos );
            }   
           
            // Download the article page via the shared Get_Http helper on PublicFun.
            tmpAllStr = new PublicFun()Get_Http( subUrltime );
            //extract the article source (text between dfStrB and dfStrE);
            //docFromStr is not read again later in this method
            string docFromStr = ;
            if ( dfStrB != && dfStrE != )
            {
                if ( tmpAllStr != )
                {
                    int b_docF = tmpAllStrIndexOf( dfStrB );
                    if ( b_docF > )
                    {
                        int e_docF = tmpAllStrIndexOf( dfStrEb_docF + dfStrBLength );
                        if ( e_docF > && e_docF > b_docF && e_docF b_docF < )
                        {
                            docFromStr = tmpAllStrSubstring( b_docF + dfStrBLength e_docF b_docF dfStrBLength );
                        }
                    }
                }
            }
            //extract the content
            if ( tmpAllStr != )
            {               
                // Both content markers must be found, otherwise the method returns early.
                int begin_strnum = tmpAllStrIndexOf( b_Content );
                if ( begin_strnum < )
                {                  
                    return ;
                }
                int end_strnum = tmpAllStrIndexOf( e_Contentbegin_strnum + b_ContentLength );
                if ( end_strnum < )
                {                  
                    return ;
                }
                // sTotalSubM is the working copy of the content region.
                string sTotalSubM = ;
                if ( end_strnum > begin_strnum )
                {
                    sTotalSubM = tmpAllStrSubstring ( begin_strnumend_strnum begin_strnum );
                }
               
                if ( sTotalSubM == )
                {                  
                    return ;
                }              
                //filter out unwanted text: drop the span from b_Filter through e_Filter
                int bfnum = sTotalSubMIndexOf( b_Filter );
                if ( bfnum > )
                {
                    int efnum = sTotalSubMIndexOf( e_Filterbfnum );
                    if ( efnum > )
                    {
                        if ( efnum > bfnum )
                        {
                            sTotalSubM = sTotalSubMSubstring( bfnum ) + sTotalSubMSubstring( efnum + e_FilterLength );
                        }
                    }
                }
                //normalise image attributes: "Src=" and "SRC=" become "src=";
                //the third loop rewrites "src=" to src=\ (presumably adjusting the quote style)
               
                while( sTotalSubMIndexOf( Src= ) >= )
                {
                    sTotalSubM = sTotalSubMSubstring( sTotalSubMIndexOf( Src= ) ) + src= + sTotalSubMSubstring( sTotalSubMIndexOf( Src= ) + );
                }
                while( sTotalSubMIndexOf( SRC= ) >= )
                {
                    sTotalSubM = sTotalSubMSubstring( sTotalSubMIndexOf( SRC= ) ) + src= + sTotalSubMSubstring( sTotalSubMIndexOf( SRC= ) + );
                }
                while( sTotalSubMIndexOf( src= ) >= )
                {
                    sTotalSubM = sTotalSubMSubstring( sTotalSubMIndexOf( src= ) ) + src=\ + sTotalSubMSubstring( sTotalSubMIndexOf( src= ) + );
                }
 
                //collect image addresses, one "src=" occurrence per iteration
                int end_n = ;
                int end_nums = ;
                int begin_nums = sTotalSubMIndexOf( src= );
                while( begin_nums >= )
                {
                    // When the text after "src=" is the escaped character, the path
                    // apparently starts one position later.
                    String tmpStr = sTotalSubMSubstring( begin_nums + );
                    if ( tmpStr == \ )
                    {
                        end_n = begin_nums + ;
                    }
                    else
                    {
                        end_n = begin_nums + ;
                    }
                    // Candidate end positions: a (missing) delimiter and the tag's ">".
                    // Without a ">" the scan stops.
                    int end_numsa = sTotalSubMIndexOf( end_n );
                    int end_numsb = sTotalSubMIndexOf( >end_n );
                    if ( end_numsb < )
                    {
                        break;
                    }
                    // Use the earlier of the two candidates; fall back to ">" when
                    // the other delimiter was not found.
                    if ( end_numsa > end_numsb )
                    {
                        end_nums = end_numsb;
                    }
                    else
                    {
                        if (end_numsa<)
                        {
                            end_nums = end_numsb;
                        }
                        else
                        {
                            end_nums = end_numsa;
                        }
                    }
                    // When the text at the end is a quote, the end position is
                    // adjusted (presumably moved back by one).
                    tmpStr = sTotalSubMSubstring( end_nums );
                    if ( tmpStr == \ || tmpStr == )
                    {
                        end_nums = end_nums ;
                    }
                    // tmpPicStr is the raw image path.
                    string tmpPicStr = sTotalSubMSubstring( end_nend_nums end_n );
 
                    // Paths that do not contain the searched scheme text are treated
                    // as relative and prefixed with a base URL.
                    if ( tmpPicStrIndexOf( // ) < )
                    {
                        if ( tmpPicStrIndexOf( / ) == )
                        {
                            tmpPicStr = urlch + tmpPicStr;
                        }
                        else
                        {                           
                            // Strip leading path segments from the image path while
                            // counting them (presumably "../" steps), then trim the
                            // same number of trailing segments off subUrl.
                            int linshiIntNum = ;
                            int flags = ;
                            string urlChange = subUrl;
                            while( tmpPicStrIndexOf( / ) >= )
                            {
                                tmpPicStr = tmpPicStrSubstring( tmpPicStrIndexOf(/) + );
                                linshiIntNum = linshiIntNum + ;
                                flags = flags + ;
                            }
                            while( ( urlChangeLastIndexOf( / ) >= ) && ( linshiIntNum >= ) )
                            {
                                urlChange = urlChangeSubstring( urlChangeLastIndexOf( / ) );
                                linshiIntNum = linshiIntNum ;
                            }
                            // flags records whether any segment was stripped and
                            // selects which base the path is joined to.
                            if ( flags == )
                            {
                                tmpPicStr = urlch + / + tmpPicStr;
                            }
                            else
                            {
                                tmpPicStr = urlChange + / + tmpPicStr;
                            }
                        }
                    }
                    // The extension test runs on a lower-cased copy so the original
                    // casing of the URL is kept for the download.
                    //tmpPicStr = tmpPicStrToLower();
                    string tmpPicStrTmp = tmpPicStrToLower();
                    //if ( tmpPicStrIndexOf( jpg ) > || tmpPicStrIndexOf( gif ) > || tmpPicStrIndexOf( bmp ) > )
                    if ( tmpPicStrTmpIndexOf( jpg ) > || tmpPicStrTmpIndexOf( gif ) > || tmpPicStrTmpIndexOf( bmp ) > )
                    {
                        // Remember the image URL for the download step.
                        rePicStr = rePicStr + || + tmpPicStr ;
 
                        // Build the local file name (picHtml + text from the last "/")
                        // and splice it into the content wrapped in ****** markers,
                        // which are turned back into an <img> tag after tag stripping.
                        int flagN = tmpPicStrLastIndexOf( / );
                        string fileN = picHtml + tmpPicStrSubstring( flagN );
                        sTotalSubM = sTotalSubMSubstring( end_nums ) + >****** + fileN + ******< + sTotalSubMSubstring( end_nums );
 
                        // Resume the search after the inserted marker text.
                        begin_nums = sTotalSubMIndexOf( src= end_nums + fileNLength + );
                    }
                    else
                    {
                        // Not a jpg/gif/bmp reference: just move on to the next "src=".
                        begin_nums = sTotalSubMIndexOf( src= end_nums + );                       
                    }                  
                }
                // Presumably drops the leading "||" separator from the list - confirm.
                if ( rePicStrLength >
                    rePicStr = rePicStrSubstring();              
 
                //content processing: protect key tags with placeholders before tag
                //stripping (<P, <p, </P, </p become |****|<, and the <br variants become +****+<)
                while( sTotalSubMIndexOf( <P ) >= )
                {
                    sTotalSubM = sTotalSubMSubstring( sTotalSubMIndexOf( <P ) ) + |****|< + sTotalSubMSubstring( sTotalSubMIndexOf( <P ) + );
                }
                while( sTotalSubMIndexOf( <p ) >= )
                {
                    sTotalSubM = sTotalSubMSubstring( sTotalSubMIndexOf( <p ) ) + |****|< + sTotalSubMSubstring( sTotalSubMIndexOf( <p ) + );
                }
                while( sTotalSubMIndexOf( </P ) >= )
                {
                    sTotalSubM = sTotalSubMSubstring( sTotalSubMIndexOf( </P ) ) + |****|< + sTotalSubMSubstring( sTotalSubMIndexOf( </P ) + );
                }
                while( sTotalSubMIndexOf( </p ) >= )
                {
                    sTotalSubM = sTotalSubMSubstring( sTotalSubMIndexOf( </p ) ) + |****|< + sTotalSubMSubstring( sTotalSubMIndexOf( </p ) + );
                }
                while( sTotalSubMIndexOf( <br ) >= )
                {
                    sTotalSubM = sTotalSubMSubstring( sTotalSubMIndexOf( <br ) ) + +****+< + sTotalSubMSubstring( sTotalSubMIndexOf( <br ) + );
                }
                while( sTotalSubMIndexOf( <BR ) >= )
                {
                    sTotalSubM = sTotalSubMSubstring( sTotalSubMIndexOf( <BR ) ) + +****+< + sTotalSubMSubstring( sTotalSubMIndexOf( <BR ) + );
                }
                while( sTotalSubMIndexOf( <Br ) >= )
                {
                    sTotalSubM = sTotalSubMSubstring( sTotalSubMIndexOf( <Br ) ) + +****+< + sTotalSubMSubstring( sTotalSubMIndexOf( <Br ) + );
                }
                while( sTotalSubMIndexOf( <bR ) >= )
                {
                    sTotalSubM = sTotalSubMSubstring( sTotalSubMIndexOf( <bR ) ) + +****+< + sTotalSubMSubstring( sTotalSubMIndexOf( <bR ) + );
                }
 
                //strip html tags
                // First trim a partial tag at the start (presumably a ">" that
                // appears before any "<") ...
                int linshiInt = sTotalSubMIndexOf( < );
                int linshiInt = sTotalSubMIndexOf( > );           
 
                if ( linshiInt < linshiInt )
                {
                    sTotalSubM = sTotalSubMSubstring( linshiInt + );
                }
                // ... and a partial tag at the end (presumably a trailing "<"
                // with no closing ">").
                int linshiInt = sTotalSubMLastIndexOf( < );
                int linshiInt = sTotalSubMLastIndexOf( > );
                if ( linshiInt < linshiInt )
                {
                    sTotalSubM = sTotalSubMSubstring( linshiInt + );
                }
                // Then remove every remaining "<...>" pair; an unterminated "<"
                // truncates the text instead.
                linshiInt = sTotalSubMIndexOf( < );
                while ( linshiInt >= )
                {
                    linshiInt = sTotalSubMIndexOf( >linshiInt );
                    if ( linshiInt >= )
                    {              
                        sTotalSubM = sTotalSubMSubstring( linshiInt ) + sTotalSubMSubstring( linshiInt + );
                    }
                    else
                    {
                        sTotalSubM = sTotalSubMSubstring( linshiInt );
                    }
                    linshiInt = sTotalSubMIndexOf(<);
                }
 
                //restore the key tags from their placeholders
                int linshiInt = ;
                int linshiInt = ;
 
                // Both placeholder kinds are replaced with "<br>\n".
                while( sTotalSubMIndexOf( +****+ ) >= )
                {
                    sTotalSubM = sTotalSubMSubstring( sTotalSubMIndexOf( +****+ ) ) + <br>\n + sTotalSubMSubstring( sTotalSubMIndexOf( +****+ ) + );
                }
                while( sTotalSubMIndexOf( |****| ) >= )
                {
                    sTotalSubM = sTotalSubMSubstring( sTotalSubMIndexOf( |****| ) ) + <br>\n + sTotalSubMSubstring( sTotalSubMIndexOf( |****| ) + );
                }
                // Each ******name****** pair becomes an <img src=...> tag; an
                // unpaired marker ends the loop.
                while( sTotalSubMIndexOf( ****** ) >= )
                {
                    linshiInt = sTotalSubMIndexOf( ****** ) + ;
                    linshiInt = sTotalSubMIndexOf( ******linshiInt );
                    if ( linshiInt >= )
                    {
                        // Split into text before the marker, the file name between
                        // the markers, and the text after the closing marker.
                        int tmpPos = sTotalSubMIndexOf( ****** );
                        string tmpStr = sTotalSubMSubstring( tmpPos );
                        string tmpStr = sTotalSubMSubstring( linshiIntlinshiInt linshiInt );
                        string tmpStr = sTotalSubMSubstring( linshiInt + );
                        sTotalSubM = tmpStr + <img src= + tmpStr + > + tmpStr;
                    }
                    else
                    {
                        break;
                    }
                }
                //remove the title from the content (first occurrence of subTitle only)
                if ( sTotalSubMIndexOf( subTitle ) >= )
                {
                    sTotalSubM = sTotalSubMSubstring( sTotalSubMIndexOf( subTitle ) ) + sTotalSubMSubstring( sTotalSubMIndexOf( subTitle ) + subTitleLength );
                }
                reContentStr = sTotalSubM;
                //invoke the image download feature
                //download each collected image into the "images" folder under root
                string[] img_Url = new PublicFun()split( rePicStr|| );
                for ( int i=;i<img_UrlLength;i++ )
                {
                    if ( img_Url[i] != )
                    {
                        // The local file name is the text after the last "/" of the image URL.
                        new PublicFun()Get_Img( img_Url[i]root + \\images\\ + img_Url[i]Substring( img_Url[i]LastIndexOf(/)+ ) );
                    }
                }
            }
            return reContentStr;
 
        }
以上方法返回要取得的信息，包括標題、內容、圖片地址等。
 
下載頁面中圖片
//下載圖片
        public void Get_Img(string a_strUrlint timeoutstring filepath)
        {
            try
            {
HttpWebRequest myReq = (HttpWebRequest)HttpWebRequestCreate(a_strUrl) ;
                myReqTimeout = timeout;
                HttpWebResponse HttpWResp = (HttpWebResponse)myReqGetResponse();        
                Stream myStream = HttpWRespGetResponseStream () ;         
                Bitmap map = new Bitmap( myStream );
                PictureBox picB = new PictureBox();
                picBImage = (Image)map;
                string path = filepathSubstring( filepathLastIndexOf( \\ ) );
                if (!DirectoryExists(path))
                {
                    CreateDir( path );
                }              
                picBImageSave(filepath);               
            }
            catch(Exception exp)
            {
                string ss = expMessage;
    WriteLog( filepathSubstring(filepathLastIndexOf(\\)) + \\errorloga_strUrl + + ss + \r\n);    
            }
        }
 
保存文件或入庫
上面取得的信息可以按自己的要求保存
 
****設計的時候沒有使用 url 按層次循環抓取，這樣定義抓取 url 效率更高、速度更快。
 
 
測試程序下載
 
如有建議請發送EMail 或msn
 
此版本只提供靜態文件存儲功能，不提供數據庫接口，不提供自定義網站功能。
    本程序運行需要先安裝 .NET 框架。


From:http://tw.wingwit.com/Article/program/net/201311/12401.html
    推薦文章
    Copyright © 2005-2013 電腦知識網 Computer Knowledge   All rights reserved.