Extracting Table Data (Perl Cookbook, 2nd Edition)

use HTML::TableContentParser;

# Parse all tables found in $HTML (the caller supplies the markup).
# parse() yields a reference to an array with one hash per table.
$tcp    = HTML::TableContentParser->new;
$tables = $tcp->parse($HTML);

foreach $table (@$tables) {
    # text of each header cell of this table
    @headers = map { $_->{data} } @{ $table->{headers} };

    # attributes of table tag available as keys in hash
    $table_width = $table->{width};

    # rows belong to the current table ($table), not to the list of
    # all tables ($tables)
    foreach $row (@{ $table->{rows} }) {
        # attributes of tr tag available as keys in hash
        # the cells of a row are stored under the "cells" key
        foreach $col (@{ $row->{cells} }) {
            # attributes of td tag available as keys in hash
            $data = $col->{data};
        }
    }
}

<table width="100%" bgcolor="#ffffff"> <tr> <td>Larry &amp; Gloria</td> <td>Mountain View</td> <td>California</td> </tr> <tr> <td><b>Tom</b></td> <td>Boulder</td> <td>Colorado</td> </tr> <tr> <td>Nathan &amp; Jenine</td> <td>Fort Collins</td> <td>Colorado</td> </tr> </table>

[ { 'width' => '100%', 'bgcolor' => '#ffffff', 'rows' => [ { 'cells' => [ { 'data' => 'Larry &amp; Gloria' }, { 'data' => 'Mountain View' }, { 'data' => 'California' }, ], 'data' => "\n " }, { 'cells' => [ { 'data' => '<b>Tom</b>' }, { 'data' => 'Boulder' }, { 'data' => 'Colorado' }, ], 'data' => "\n " }, { 'cells' => [ { 'data' => 'Nathan &amp; Jenine' }, { 'data' => 'Fort Collins' }, { 'data' => 'Colorado' }, ], 'data' => "\n " } ] } ]

Example 20-11. Dump modules for a particular CPAN author

  #!/usr/bin/perl -w
  # dump-cpan-modules-for-author - display modules a CPAN author owns
  use LWP::Simple;
  use URI;
  use HTML::TableContentParser;
  use HTML::Entities;
  use strict;
  # Page to scrape: first command-line argument, or TOMC's author page.
  our $URL = shift || 'http://search.cpan.org/author/TOMC/';
  my $tables = get_tables($URL);
  # 5th table holds module data; fail loudly if the page has fewer
  # tables instead of silently printing nothing
  my $modules = ref($tables) eq 'ARRAY' ? $tables->[4] : undef;
  die "No module table (5th table) found at $URL\n" unless $modules;
  foreach my $r (@{ $modules->{rows} || [] }) {
    my ($module_name, $module_link, $status, $description) =
        parse_module_row($r, $URL);
    # a row that yielded no module name has nothing worth printing
    next unless defined $module_name && length $module_name;
    print "$module_name <$module_link>\n\t$status\n\t$description\n\n";
  }
  # get_tables($URL)
  #   Fetch the page at $URL and return whatever
  #   HTML::TableContentParser's parse() produces for its content.
  #   Dies if the page cannot be fetched.
  sub get_tables {
    my $URL = shift;
    # LWP::Simple's get() returns undef when the fetch fails; stop here
    # rather than handing undef to the parser
    my $page = get($URL);
    die "Couldn't fetch $URL\n" unless defined $page;
    my $tcp = HTML::TableContentParser->new;
    return $tcp->parse($page);
  }
  # parse_module_row($row, $URL)
  #   Extract the module name, module link, status, and description from
  #   one parsed table row.  $row is a hash whose "cells" entry holds the
  #   row's cells; $URL is the page address used to resolve relative links.
  #   Returns the list ($module_name, $module_link, $status, $description).
  #   Missing cells, or a first cell without a link, yield empty strings
  #   rather than undef.
  sub parse_module_row {
    my ($row, $URL) = @_;
    # extract cells: [0] link and name in HTML, [1] status string and
    # link, [2] description only; default absent cells to ''
    my $cells = $row->{cells} || [];
    my ($module_html, $status, $description) =
        map {
          my $cell = $cells->[$_];
          $cell && defined $cell->{data} ? $cell->{data} : '';
        } 0 .. 2;
    $status =~ s{<.*?>}{  }g; # naive link removal, works on this simple HTML
    # separate module link and name from html; the name is the anchor's
    # content up to its closing tag, and /s lets the cell span lines
    my ($module_link, $module_name) =
        $module_html =~ m{href="([^"]*)"[^>]*>(.*?)</a>}is;
    if (defined $module_link) {
      $module_link = URI->new_abs($module_link, $URL); # resolve relative links
    }
    else {
      # no link in this row (the match failed)
      ($module_link, $module_name) = ('', '');
    }
    # decode entities in place (tags inside the name are left as they are)
    decode_entities($module_name);
    decode_entities($description);
    return ($module_name, $module_link, $status, $description);
  }

20.19. Extracting Table Data

20.19.1. Problem

20.19.2. Solution

20.19.3. Discussion

Example 20-11. Dump modules for a particular CPAN author

20.19.4. See Also