|
|
本帖最后由 gao15678 于 2017-1-13 00:58 编辑
// Original code (as posted):

/**
 * Re-encode a UTF-8 string as GBK.
 *
 * @param string $str_caiji Text encoded as UTF-8 (in this script: scraped HTML).
 * @return string|false Whatever mb_convert_encoding() returns for the
 *                      'utf-8' -> 'gbk' conversion.
 */
function utf8_to_gbk($str_caiji)
{
    return mb_convert_encoding($str_caiji, 'gbk', 'utf-8');
}
// When the template $moban contains the {apineirong} placeholder, scrape one
// randomly chosen article from chinanews.com, substitute its title for
// {apibiaoti} and its body for {apineirong}, and print the template.
// If no article can be obtained, the template is printed unchanged.
if (stristr($moban, '{apineirong}')) {
    // Customisation point: URL of the listing page to scrape.
    $str_caiji = file_get_contents('http://www.chinanews.com/china.shtml?qq-pf-to=pcqq.c2c');
    if ($str_caiji === false) {
        // Download failed: continue with no candidates so the template is
        // still printed instead of feeding `false` into the regex calls.
        $str_caiji = '';
    }

    // Customisation point: regex selecting the <div class="..."><a ...>...</a>
    // entries of the listing page.
    $reg_caiji = '/<div class="\w+"><a.*?>\s*(.*?)\s*<\/a>.*?/';
    preg_match_all($reg_caiji, $str_caiji, $out_caiji);
    // Keep only the matched entries; the article paths are extracted from them.
    $str_caiji = implode('', $out_caiji[0]);

    // Customisation point: regex extracting site-relative article paths.
    // The dot before "shtml" is escaped so that it only matches a literal ".".
    $reg_caiji = '/(\/(\d|\w)+\-*(\d|\w)+)+\.shtml/i';
    preg_match_all($reg_caiji, $str_caiji, $out_caiji);
    $num = count($out_caiji[0]);

    $html = false;
    if ($num > 0) {
        // mt_rand() is inclusive on both ends, so the valid index range is
        // 0..$num-1. A single draw is used so the entry that is checked is
        // the entry that is fetched.
        // Customisation point: site prefix prepended to the relative path.
        $link = 'http://www.chinanews.com' . $out_caiji[0][mt_rand(0, $num - 1)];
        // file_get_html() is defined outside this snippet (presumably the
        // simple_html_dom library -- TODO confirm).
        $html = file_get_html($link);
    }

    // Guard against a failed fetch or a page without the expected elements,
    // which would otherwise be a fatal "member function on non-object" error.
    $title_node = $html ? $html->find('h1', 0) : null;
    // Customisation point: selector of the element holding the article body.
    $body_node = $html ? $html->find('div[class=left_zw]', 0) : null;

    if ($title_node && $body_node) {
        // NOTE(review): the title is inserted as-is while the body is
        // converted to GBK -- confirm which encoding the template expects.
        $title = $title_node->innertext;
        $body = utf8_to_gbk($body_node->outertext);

        // Customisation point: strip anchor tags but keep their inner text.
        $body = preg_replace("/<(\/)?a .*?>/si", '', $body);
        $body = preg_replace("/<\/a>/si", '', $body);
        // Customisation point: turn site-relative resource paths of the form
        // /dir/123/45/67.ext into absolute URLs on the scraped site.
        $body = preg_replace("/(\/\w+\/\d+\/\d+\/\d+\.\w+)/", 'http://www.chinanews.com' . '\1', $body);

        $moban = str_replace('{apibiaoti}', $title, $moban);
        $moban = str_replace('{apineirong}', $body, $moban);
    }

    // Printed in every case, with or without the placeholders filled in.
    echo $moban;
}
我想改为采集这个网址:http://roll.blog.sina.com.cn/list/other/index_1.shtml ,但不知道该怎么改。我是小白,请各路大神、超神帮忙改一下,小白磕头感谢! |
|