在數據採集與頁面分析中,
這裡是一個測試例子的實現,
代碼如下:
/*
Match the links found in a given page.
return: array match[link   (the rest of this description is cut off in this listing)

NOTE(review): every line of this listing is truncated at its first comma or
quote, so the parameter list after $host, the regular expressions and the
keys written into $match are not visible here. Recover the complete source
before relying on or modifying this function.

NOTE(review): the first preg_match_all() call is followed directly by
`return $m;`, with no visible condition around it. If that is accurate, all
of the $match-building loops below it never run and the function returns the
raw preg_match_all() capture array - confirm against the complete source.

NOTE(review): the `while(list($key ...` loops look like the list()/each()
iteration idiom; each() no longer exists in PHP 8 - confirm and migrate to
foreach if so.

Visible structure of the (possibly unreachable) remainder: the first two
loops test each non-empty value against /http/ and write to $match in either
branch; the last two loops write every non-empty value to $match; the
function then returns an element of $match.
*/
function match_links($host
$pattern =
preg_match_all($pattern
return $m;
preg_match_all("
while(list($key
if(!empty($val))
if(preg_match("/http/"
$match[
}
else {
$match[
}
}
while(list($key
if(!empty($val))
if(preg_match("/http/"
$match[
}
else {
$match[
}
}
while(list($key
if(!empty($val))
$match[
}
while(list($key
if(!empty($val))
$match[
}
return $match[
}
/*
Fetch the text content of the page at the given url.

$url: passed straight to file_get_contents(); the @ operator suppresses any
      warning, so a failed fetch is not reported at this point.
return: the processed string, or NULL when the final string is empty/falsy.

NOTE(review): several lines of this listing are truncated at their first
comma or quote (the mb_check_encoding()/iconv() arguments and the final
preg_match_all()/join() arguments), so the exact encodings and pattern are
not visible here - recover the complete source before changing them.
*/
function get_content_from_url($url) {
$str = @file_get_contents($url);
if(mb_check_encoding($str
$str = iconv("GBK"
$str = strip_tags($str); // strip HTML tags
/*
$str = preg_replace( "@<script(
$str = preg_replace( "@<iframe(
$str = preg_replace( "@<style(
$str = preg_replace( "@<(
*/
// filter out non-Chinese characters (per the original comment; the pattern itself is truncated in this listing)
preg_match_all(
$str = join(
if(!$str)
return NULL;
return $str;
}
/*
Fetch a page and follow the links found in it, collecting content into
$content, which is returned at the end.

Visible behaviour: returns false when $url is empty/falsy or when the
(truncated) $depth check fails; inside a loop driven by $depth it downloads
$url with @file_get_contents() (warnings suppressed) and returns false if
nothing came back; it then parses the url, builds $host from the parsed
parts, collects links with match_links(), de-duplicates them with
array_unique() and iterates over them.

NOTE(review): every line of this listing is truncated at its first comma or
quote, so the parameters after $url, the $depth comparisons, the way $depth
changes and how $content is accumulated are not visible here - recover the
complete source before modifying this function.

NOTE(review): `$parseurl[scheme]` uses an unquoted array key. PHP treats a
bare word as a constant; this raised a notice/warning in older versions and
is an Error for an undefined constant in PHP 8 - it should presumably be
'scheme'; confirm.

NOTE(review): the foreach reuses $url as its loop variable, overwriting the
$url parameter that the enclosing while loop fetches from on its next
iteration - verify this is intended.
*/
function get_content($url
if(!$url || $depth <
return false;
while($depth >
$str = @file_get_contents($url);
if(!$str)
return false;
$parseurl = parse_url($url);
if($parseurl[
$host = $parseurl[scheme]
$arrlink = match_links($host
$arr_url = array_unique($arrlink);
$depth
foreach($arr_url as $url){
$content
}
}
$content
return $content;
}
From:http://tw.wingwit.com/Article/program/PHP/201311/21224.html