熱點推薦:
您现在的位置: 電腦知識網 >> 編程 >> .NET編程 >> 正文

一個用C#過濾HTML代碼的函數

2013-11-13 11:01:21  來源: .NET編程 

  正好有時間，所以用 C# 寫了一段正則表達式，作用是刪除 Page 裡面 Code 中的 HTML 標簽。這在做采集信息、需要消除其中的 HTML 時很有用處。
以下是引用片段：

// Matches a complete <script>, <iframe> or <frameset> element together with its content.
// The lazy quantifier plus the \1 back-reference stop the match at the first matching
// closing tag, so text that sits between two separate elements is preserved
// (a greedy [\s\S]+ would swallow everything up to the LAST closing tag in the input).
private static readonly System.Text.RegularExpressions.Regex BlockedElementRegex =
    new System.Text.RegularExpressions.Regex(
        @"<(script|iframe|frameset)\b[\s\S]*?</\1\s*>",
        System.Text.RegularExpressions.RegexOptions.IgnoreCase);

// Matches any single tag: '<', everything up to the next '>', and that '>'.
private static readonly System.Text.RegularExpressions.Regex AnyTagRegex =
    new System.Text.RegularExpressions.Regex(@"<[^>]*>");

/// <summary>
/// Strips HTML markup from <paramref name="html"/> and returns the remaining text.
/// </summary>
/// <remarks>
/// Processing order:
/// 1. script, iframe and frameset elements are removed together with their content;
/// 2. every remaining tag is removed, keeping the text between tags;
/// 3. the <c>&amp;nbsp;</c> entity is removed.
/// Separate passes for href="javascript:", on* event attributes, img, p and strong tags
/// are not needed: all of these live inside tags, which step 2 removes as a whole.
/// This is a text-extraction helper for scraped pages; it is not an HTML sanitizer and
/// its output should still be encoded before being written back into a page.
/// </remarks>
/// <param name="html">HTML fragment to clean. May be null or empty.</param>
/// <returns>
/// The text with markup removed, or an empty string when <paramref name="html"/> is
/// null or empty.
/// </returns>
public string checkStr(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // Drop elements whose content must not survive as text.
    html = BlockedElementRegex.Replace(html, "");

    // Drop all remaining tags (a, img, p, strong, ...), keeping their inner text.
    html = AnyTagRegex.Replace(html, "");

    // NOTE(review): the first argument of this call was lost when the snippet was
    // extracted from the web page; "&nbsp;" is assumed - confirm against the original.
    html = html.Replace("&nbsp;", "");

    return html;
}

From:http://tw.wingwit.com/Article/program/net/201311/15002.html
    Copyright © 2005-2013 電腦知識網 Computer Knowledge   All rights reserved.