的HTML

Question

我想抓取Google搜索结果..

如何在不指定绝对路径的情况下获取所有 <div class="g"> 元素？

此模式//h3[@class="r"]将获取所有h3元素

此模式//div[@class="g"]不返回任何内容

HTML

...
    <div id="ires">
        <ol>
            <div class="srg">
                <div class="g">
                    ...
                        <h3 class="r"></h3>
                    ...
                </div>
                <div class="g">
                    ...
                        <h3 class="r"></h3>
                    ...
                </div>
            </div>
        </ol>
    </div>
...

代码

// Download the first 100 Google results for a search term and print every
// <div class="g"> result container found under the #ires wrapper.
// NOTE(review): $query is not defined in this snippet — presumably set by the caller.
// NOTE(review): no CURLOPT_USERAGENT is sent; the server may return different
// markup to clients without one — confirm against the HTML actually received.
$ch = curl_init();
if ($ch === false) {
    throw new RuntimeException('Failed to initialise cURL');
}
curl_setopt($ch, CURLOPT_URL, 'https://www.google.dk/search?q='.urlencode($query).'&start=0&num=100');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
// CURLOPT_SSL_VERIFYPEER is intentionally left at its default (enabled):
// turning it off exposes the request to man-in-the-middle tampering.
$html = curl_exec($ch);
$httpcode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
// Capture the error text before the handle is released.
$curlError = curl_error($ch);
curl_close($ch);

// curl_exec() returns false on transport failure; loadHTML() must never see that.
if ($html === false || $html === '') {
    throw new RuntimeException('Request failed or returned an empty body: '.$curlError);
}
if ($httpcode < 200 || $httpcode >= 300) {
    throw new RuntimeException('Unexpected HTTP status: '.$httpcode);
}

// Collect libxml parse warnings internally instead of emitting one PHP
// warning per malformed tag.
libxml_use_internal_errors(true);
$doc = new DOMDocument('1.0', 'utf-8');
$doc->validateOnParse = false;
$doc->standalone = true;
$doc->preserveWhiteSpace = true;
$doc->strictErrorChecking = false;
$doc->substituteEntities = false;
$doc->recover = true;
$doc->formatOutput = true;
$doc->loadHTML($html);
libxml_clear_errors();

// May be null when the page has no #ires element; the query then runs
// against the whole document.
$div = $doc->getElementById('ires');

$xpath = new DOMXPath($doc);
// A leading "//" always starts at the document root, even when a context node
// is supplied; ".//" limits the search to descendants of the context node.
// The contains/concat idiom matches "g" as one class token, so elements with
// several classes (class="g foo") are found as well.
$expression = './/div[contains(concat(" ", normalize-space(@class), " "), " g ")]';
foreach ($xpath->query($expression, $div) as $node) {
    print_r($node);
}

Answer 1

这有效

// Download a Google result page with a browser-like User-Agent and parse it
// into $dom / $xpath for the queries that follow.
$ch = curl_init();
if (false === $ch) {
    throw new RuntimeException('failed to initialize');
}

try {
    curl_setopt($ch, CURLOPT_URL, "https://www.google.ie/search?q=whereNotnull+laravel+5&ie=utf-8&oe=utf-8&gws_rd=cr&ei=FuezVfedF6aC7gaYiJiYBw");
    curl_setopt($ch, CURLOPT_USERAGENT, 'Firefox/14.0');
    // CURLOPT_SSL_VERIFYPEER is intentionally left at its default (enabled):
    // turning it off exposes the request to man-in-the-middle tampering.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    $content = curl_exec($ch);
    if (false === $content) {
        // Throw rather than trigger_error(E_USER_ERROR): that level is
        // deprecated since PHP 8.4, and a custom error handler could let
        // execution continue with $content undefined.
        throw new RuntimeException(
            sprintf('Curl failed with error #%d: %s', curl_errno($ch), curl_error($ch)),
            curl_errno($ch)
        );
    }
} finally {
    // Release the handle on both the success and the failure path.
    curl_close($ch);
}

if ('' === $content) {
    throw new RuntimeException('Curl returned an empty response body');
}

$dom = new DOMDocument();
// Collect libxml parse warnings internally instead of emitting one PHP
// warning per malformed tag.
libxml_use_internal_errors(true);
$dom->loadHTML($content);
libxml_clear_errors();
$xpath = new DOMXPath($dom);

选择所有 class 为 "g" 的 div：

   $xpath_resultset =  $xpath->query('//div[@class="g"]');

循环遍历 $xpath_resultset 中的节点：

  foreach($xpath_resultset as $result){

将节点作为文本输出：

   echo $dom->saveHTML($result);

将节点作为 HTML 代码输出（经过转义）：

   echo htmlspecialchars($dom->saveHTML($result)); // echo as html code
}

使用DOMXPath获取元素

的HTML

码

1 个答案: