Question

我正在尝试使用 cURL 提取网址数据，但以下代码对某些服务器或网址不起作用，例如：http://www.elwatannews.com/news/details/446177 。我该如何解决这个问题？又该如何提取上述网址的标题？

/**
 * Download a URL with cURL using browser-like request headers.
 *
 * Redirect responses (301, 302, 303, 307, 308) are followed manually, one
 * request per hop, so the returned text only ever contains the final
 * response. Because CURLOPT_HEADER is enabled, the returned string is the raw
 * response: the HTTP header block followed by the body.
 *
 * Any failure (transport error, non-200 status, unusable redirect target, or
 * too many redirects) is appended to ./curl.error.log and reported as FALSE.
 * The script is terminated with die() when the cURL extension is missing.
 *
 * @param string $url           Absolute URL to fetch.
 * @param int    $max_redirects How many further redirects may be followed.
 *
 * @return string|false Raw response (headers + body) on HTTP 200, FALSE otherwise.
 */
function file_get_site($url, $max_redirects = 10){
    if (!function_exists('curl_init')) {
        die('cURL Must be installed. Ask your host to      enable it or uncomment extension=php_curl.dll in php.ini');
    }

    $header = array(
        "Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Cache-Control: max-age=0",
        "Connection: keep-alive",
        "Keep-Alive: 300",
        "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
        "Accept-Language: en-us,en;q=0.5",
        "Pragma: ",
    );

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 5.1; rv:5.0)      Gecko/20100101 Firefox/5.0 Firefox/5.0');
    curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
    curl_setopt($curl, CURLOPT_HEADER, true);
    curl_setopt($curl, CURLOPT_REFERER, $url);
    curl_setopt($curl, CURLOPT_ENCODING, 'gzip,deflate');
    curl_setopt($curl, CURLOPT_AUTOREFERER, true);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_TIMEOUT, 60);

    $html = curl_exec($curl);

    // Collect everything needed from the handle before it is closed.
    $status = curl_getinfo($curl);
    $error = curl_error($curl);
    curl_close($curl);

    $code = isset($status['http_code']) ? (int) $status['http_code'] : 0;

    // curl_exec() returns false on a transport failure even if a status line
    // was already received, so both conditions are required for success.
    if ($html !== false && $code === 200) {
        return $html;
    }

    if ($html !== false && in_array($code, array(301, 302, 303, 307, 308), true)) {
        // Prefer the target computed by cURL: it is already absolute, even
        // when the server sent a relative Location header.
        $target = isset($status['redirect_url']) ? trim($status['redirect_url']) : '';

        if ($target === '') {
            // Fallback: read the header block ourselves. Header names are
            // case-insensitive, and the value runs to the end of the line.
            $header_size = isset($status['header_size']) ? (int) $status['header_size'] : 0;
            $raw_headers = substr($html, 0, $header_size);
            $matches = array();
            if (preg_match('/^(?:Location|URI):[ \t]*([^\r\n]+)/mi', $raw_headers, $matches)) {
                $target = trim($matches[1]);
            }
        }

        // Only follow absolute http(s) targets, and stop after a bounded
        // number of hops so a redirect loop cannot recurse forever.
        $scheme = parse_url($target, PHP_URL_SCHEME);
        $is_http = is_string($scheme) && in_array(strtolower($scheme), array('http', 'https'), true);
        if ($is_http && $max_redirects > 0) {
            return file_get_site($target, $max_redirects - 1);
        }
    }

    // Failure path: dump the transfer info to the log file.
    $oline = '';
    foreach ($status as $key => $eline) {
        if (is_array($eline)) {
            // Some entries (e.g. certinfo) are arrays and cannot be concatenated directly.
            $eline = json_encode($eline);
        }
        $oline .= '['.$key.']'.$eline.' ';
    }
    if ($error !== '') {
        $oline .= '[curl_error]'.$error.' ';
    }
    $line = $oline." \r\n ".$url."\r\n-----------------\r\n";

    // Logging is best effort: an unwritable log must not break the caller.
    $handle = @fopen('./curl.error.log', 'a');
    if ($handle !== false) {
        fwrite($handle, $line);
        fclose($handle);
    }

    return FALSE;
}


 /**
  * Extract a value from the first matching element of an HTML document.
  *
  * Without an attribute filter, the text content of the first <$tag> element
  * is returned. With a filter ($id / $value), elements are scanned in document
  * order and the "content" attribute of the first element whose attribute $id
  * equals $value is returned; non-matching elements are skipped.
  *
  * @param string      $source HTML markup to parse.
  * @param string      $tag    Tag name to look for (e.g. 'title', 'meta').
  * @param string|null $id     Attribute name to filter on, or null for no filter.
  * @param string|null $value  Required value of the $id attribute.
  *
  * @return string|null The extracted text, or null when the markup is empty,
  *                     cannot be parsed, or contains no matching element.
  */
 function get_content_tags($source,$tag,$id=null,$value=null){
     // A failed download yields FALSE or an empty string; there is nothing to parse.
     if (!is_string($source) || trim($source) === '') {
         return null;
     }

     // Collect parser warnings internally instead of silencing them with "@",
     // then restore whatever error mode the caller had configured.
     $previous = libxml_use_internal_errors(true);
     $xml = new DOMDocument();
     $loaded = $xml->loadHTML($source);
     libxml_clear_errors();
     libxml_use_internal_errors($previous);

     if (!$loaded) {
         return null;
     }

     $filter = ($id !== null && $id !== '');

     foreach ($xml->getElementsByTagName($tag) as $node) {
         if (!$filter) {
             return $node->nodeValue;
         }
         // Keep scanning until the attribute matches; returning on the first
         // element regardless of the filter would pick the wrong tag.
         if ($node->getAttribute($id) === (string) $value) {
             return $node->getAttribute('content');
         }
     }

     return null;
 }


// Download the page once and reuse the markup for both lookups.
$source = file_get_site('http://www.elwatannews.com/news/details/446177');

if (!is_string($source) || $source === '') {
    // file_get_site() returns FALSE (after writing to ./curl.error.log) when
    // the request fails, so there is no markup to parse in that case.
    echo 'Unable to fetch the page; see ./curl.error.log for details.';
} else {
    echo get_content_tags($source,'title'); // text of the page's <title>

    echo get_content_tags($source,'meta','name','description'); // content of <meta name="description">
}

?>

更改用户代理后 cURL 不起作用

0 个答案: