正好有時間，所以用 C# 寫了一段正則表達式，作用是刪除頁面（Page）代碼中的 HTML 標簽；這在做信息采集、需要消除其中的 HTML 時很有用處。
以下是引用片段：
/// <summary>
/// Removes HTML markup from <paramref name="html"/> and returns the remaining text.
/// </summary>
/// <remarks>
/// Processing order:
/// 1. <c>script</c>, <c>iframe</c> and <c>frameset</c> elements are removed together
///    with their content, because that content is never meant to be displayed.
/// 2. Every remaining tag (anything of the form <c>&lt;...&gt;</c>) is removed; the text
///    between tags is kept.
/// 3. The <c>&amp;nbsp;</c> entity is removed.
/// Matching is case-insensitive.
/// </remarks>
/// <param name="html">HTML fragment to clean. May be null or empty.</param>
/// <returns>
/// The input with markup stripped; null or empty input is returned unchanged.
/// </returns>
public string checkStr(string html)
{
    // Nothing to strip; also avoids the ArgumentNullException Regex.Replace throws on null.
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    const System.Text.RegularExpressions.RegexOptions options =
        System.Text.RegularExpressions.RegexOptions.IgnoreCase;

    // Elements whose content must be dropped along with the tags. The quantifier is
    // lazy ("*?") so that each element is matched on its own; a greedy match would
    // run from the first opening tag to the LAST closing tag in the document and
    // delete all the legitimate text in between.
    string[] blockPatterns =
    {
        @"<script[\s\S]*?</script *>",
        @"<iframe[\s\S]*?</iframe *>",
        @"<frameset[\s\S]*?</frameset *>",
    };

    foreach (string pattern in blockPatterns)
    {
        html = System.Text.RegularExpressions.Regex.Replace(html, pattern, "", options);
    }

    // Strip every remaining tag. This covers <p>, <img>, <strong>, <a href=...> and
    // any on* event-handler attributes, since attributes live inside the tag that is
    // removed here; no separate attribute-neutralising pass over plain text is needed.
    html = System.Text.RegularExpressions.Regex.Replace(html, @"<[^>]*>", "", options);

    // NOTE(review): the arguments of this call were lost in the published listing;
    // assumed to be removal of the non-breaking-space entity -- confirm against the
    // original source.
    html = html.Replace("&nbsp;", "");

    return html;
}
From:http://tw.wingwit.com/Article/program/net/201311/15002.html