使用perl / shell将HTML表转换为csv

时间:2015-09-10 06:13:52

标签: html perl extract

我有一个包含表格的网站。我想将表数据提取到csv文件。我会定期使用wget获取本网站的html文件。

现在,我想做的是使用perl / shell脚本从这个html文件中提取表数据。

下面的代码是直接从网址抓取网页数据并提取表格,这不是我想要的。我想从已经手动下载到本地的html文件中提取表格。要提取的表:下面html中的 JobHistory。谁能帮我解决这个问题?提前致谢。

#!/usr/bin/perl
use warnings;
use strict;
use LWP::Simple;
use HTML::TableExtract;

# Fetch a page over HTTP, locate every table whose header row contains
# the columns 'a' and 'b', and print the first two cells of each data row.
my $url = 'http://xxx';

# LWP::Simple::get returns undef when the request fails; stop here rather
# than handing undef to the parser.
my $html = get($url);
die "Could not fetch $url\n" unless defined $html;

# Direct class-method call instead of indirect-object syntax
# ("new HTML::TableExtract ..."), which can be mis-parsed by perl.
my $te = HTML::TableExtract->new( headers => ['a', 'b'] );
$te->parse($html);

foreach my $ts ($te->table_states) {
    foreach my $row ($ts->rows) {
        # Skip garbage rows: an empty cell comes back as undef, so check
        # definedness first to avoid an "uninitialized value" warning.
        next unless defined $row->[0] && $row->[0] =~ /\w/;
        printf "%-20s   ==>   %.2f\n", $row->[0], $row->[1];
    }
}

我的html页面:

<html>
  <meta http-equiv="X-UA-Compatible" content="IE=8">
  <meta http-equiv="Content-type" content="text/html; charset=UTF-8">
  <style type="text/css">
    #jobs_paginate span {font-weight:normal}
    #jobs .progress {width:8em}
    #jobs_processing {top:-1.5em; font-size:1em;
      color:#000; background:rgba(255, 255, 255, 0.8)}
  </style>
  <title>
    JobHistory
  </title>
  <link rel="stylesheet" href="/static/yarn.css">
  <style type="text/css">
    #layout { height: 100%; }
    #layout thead td { height: 3em; }
    #layout #navcell { width: 11em; padding: 0 1em; }
    #layout td.content { padding-top: 0 }
    #layout tbody { vertical-align: top; }
    #layout tfoot td { height: 4em; }
  </style>
  <link rel="stylesheet" href="/static/jquery/themes-1.9.1/base/jquery-ui.css">
  <link rel="stylesheet" href="/static/dt-1.9.4/css/jui-dt.css">
  <script type="text/javascript" src="/static/jquery/jquery-1.8.2.min.js">
  </script>
  <script type="text/javascript" src="/static/jquery/jquery-ui-1.9.1.custom.min.js">
  </script>
  <script type="text/javascript" src="/static/dt-1.9.4/js/jquery.dataTables.min.js">
  </script>
  <script type="text/javascript" src="/static/yarn.dt.plugins.js">
  </script>
  <style type="text/css">
    #jsnotice { padding: 0.2em; text-align: center; }
    .ui-progressbar { height: 1em; min-width: 5em }
  </style>
  <script type="text/javascript">
    $(function() {
      $('#nav').accordion({autoHeight:false, active:0});
    jobsDataTable =  $('#jobs').dataTable({bStateSave : true, "fnStateSave": function (oSettings, oData) { sessionStorage.setItem( oSettings.sTableId, JSON.stringify(oData) ); }, "fnStateLoad": function (oSettings) { return JSON.parse( sessionStorage.getItem(oSettings.sTableId) );}, bJQueryUI:true, sPaginationType: 'full_numbers', iDisplayLength:20, aLengthMenu:[20, 40, 60, 80, 100], 'aaData': jobsTableData, bDeferRender: true, bProcessing: true, aaSorting: [[2, 'desc']], aoColumnDefs:[{'sType':'numeric', 'bSearchable': false, 'aTargets': [ 7, 8, 9, 10 ] }]}).fnSetFilteringDelay(188);
    var asInitVals = new Array();
$('tfoot input').keyup( function () 
{  jobsDataTable.fnFilter( this.value, $('tfoot input').index(this) );
} );
$('tfoot input').each( function (i) {
  asInitVals[i] = this.value;
} );
$('tfoot input').focus( function () {
  if ( this.className == 'search_init' )
  {
    this.className = '';
    this.value = '';
  }
} );
$('tfoot input').blur( function (i) {
  if ( this.value == '' )
  {
    this.className = 'search_init';
    this.value = asInitVals[$('tfoot input').index(this)];
  }
} );

    });
  </script>
  <div id="jsnotice" class="ui-state-error">
    This page works best with javascript enabled.
  </div>
  <script type="text/javascript">
    $('#jsnotice').hide();
  </script>
  <table id="layout" class="ui-widget-content">
    <thead>
      <tr>
        <td colspan="2">
          <div id="header" class="ui-widget">
            <div id="user">
              Logged in as: dr.who
            </div>
            <div id="logo">
              <img src="/static/hadoop-st.png">
            </div>
            <h1>
              JobHistory
            </h1>
          </div>
        </td>
      </tr>
    </thead>
    <tfoot>
      <tr>
        <td colspan="2">
          <div id="footer" class="ui-widget">
          </div>
        </td>
      </tr>
    </tfoot>
    <tbody>
      <tr>
        <td id="navcell">
          <div id="nav">
            <h3>
              Application
            </h3>
            <ul>
              <li>
                <a href="/jobhistory/about">About</a>
              <li>
                <a href="/jobhistory/app">Jobs</a>
            </ul>
            <h3>
              Tools
            </h3>
            <ul>
              <li>
                <a href="/conf">Configuration</a>
              <li>
                <a href="/logs">Local logs</a>
              <li>
                <a href="/stacks">Server stacks</a>
              <li>
                <a href="/metrics">Server metrics</a>
            </ul>
          </div>
        </td>
        <td class="content">
          <h2>
            Retired Jobs
          </h2>
          <table id="jobs">
            <thead>
              <tr>
                <th>
                  Submit Time
                </th>
                <th>
                  Start Time
                </th>
                <th>
                  Finish Time
                </th>
                <th class="id">
                  Job ID
                </th>
                <th class="name">
                  Name
                </th>
                <th>
                  User
                </th>
                <th>
                  Queue
                </th>
                <th class="state">
                  State
                </th>
                <th>
                  Maps Total
                </th>
                <th>
                  Maps Completed
                </th>
                <th>
                  Reduces Total
                </th>
                <th>
                  Reduces Completed
                </th>
              </tr>
            </thead>
            <script type="text/javascript">
            var jobsTableData=[
    ["2015.09.06 11:49:36 GMT","2015.09.06 11:49:42 GMT","2015.09.06 11:53:05     GMT","<a  href=xyz</a>","PigLatin:ctx_decode2.pig","hdfswrite","ingest","SUCCEEDED","10","10","39","39"],
    ["2015.09.06 11:49:59 GMT","2015.09.06 11:50:02 GMT","2015.09.06 11:50:19 GMT","<a href=abc</a>","PigLatin:ctx_decode2.pig","hdfswrite","ingest","SUCCEEDED","1","1","1","1"],
    ["2015.09.06 11:50:27 GMT","2015.09.06 11:50:34 GMT","2015.09.06 11:51:48 GMT","<a href=wer</a>","PigLatin:ctx_decode2.pig","hdfswrite","ingest","SUCCEEDED","1","1","2","2"],
   ["2015.09.06 11:50:48 GMT","2015.09.06 11:50:52 GMT","2015.09.06 11:51:10 GMT","<a href=xyz</a>","INSERT INTO TABLE idl...st_summary2_run_data(Stage","hdfswrite","ingest","SUCCEEDED","53","53","100","100"],
    ["2015.09.06 11:50:58 GMT","2015.09.06 11:51:02 GMT","2015.09.06 11:51:14 GMT","<a href=uig</a>","insert overwrite directory \'\/prod\/mt...layer(Stage","hdfswrite","default","SUCCEEDED","1","1","1","1"],
    ["2015.09.10 02:58:53 GMT","2015.09.10 02:59:01 GMT","2015.09.10 02:59:20 GMT","<a href=man</a>","Lot Operation Mapreduce","hdfswrite","ingest","SUCCEEDED","66","66","1","1"]
]
</script>
        <tbody>
        </tbody>
        <tfoot>
          <tr>
            <th>
              <input class="search_init" type="text" name="submit_time" value="Submit Time">
            </th>
            <th>
              <input class="search_init" type="text" name="start_time" value="Start Time">
            </th>
            <th>
              <input class="search_init" type="text" name="finish_time" value="Finish Time">
            </th>
            <th>
              <input class="search_init" type="text" name="start_time" value="Job ID">
            </th>
            <th>
              <input class="search_init" type="text" name="start_time" value="Name">
            </th>
            <th>
              <input class="search_init" type="text" name="start_time" value="User">
            </th>
            <th>
              <input class="search_init" type="text" name="start_time" value="Queue">
            </th>
            <th>
              <input class="search_init" type="text" name="start_time" value="State">
            </th>
            <th>
              <input class="search_init" type="text" name="start_time" value="Maps Total">
            </th>
            <th>
              <input class="search_init" type="text" name="start_time" value="Maps Completed">
            </th>
            <th>
              <input class="search_init" type="text" name="start_time" value="Reduces Total">
            </th>
            <th>
              <input class="search_init" type="text" name="start_time" value="Reduces Completed">
            </th>
          </tr>
        </tfoot>
      </table>
    </td>
  </tr>
</tbody>

0 个答案:

没有答案