#! perl -w
# tblext.pl
# Process the raw pages into something simple - i.e. usable on the PalmPilot
use strict;
use HTML::TableExtract;
use Data::Dumper;
use File::Find;
use CGI qw/:standard/; # load standard CGI routines
# Unbuffer STDOUT and STDERR so progress messages show up immediately
# (not really needed).
select((select(STDOUT), $| = 1)[0]);
select((select(STDERR), $| = 1)[0]);

# Raw pages collected by process(), keyed by the six-digit date tag in the
# file name; each value is [path, day tag, date tag].
# Declared before find() is called so that the callback fills a variable
# whose declaration has already been executed.
my %worklist;

# The raw page files are named like '011213_Thu.html'.
find({ wanted => \&process, no_chdir => 1 }, '.');
# File::Find callback. find() is run with no_chdir, so $_ holds the path
# relative to the start directory ('./name'). Plain files directly in that
# directory whose name looks like '<six digits>_<three chars>.html' are
# recorded in %worklist under their six-digit tag as [path, day, date].
sub process
{
    my $path = $_;
    return unless -f $path;

    # A failed match yields an empty list, which is false in this context.
    my ($datetag, $daytag) = $path =~ /^\.\/(\d\d\d\d\d\d)_(...)\.html$/i
        or return;

    $worklist{$datetag} = [$path, $daytag, $datetag];
    #~ munge($path, $daytag, $datetag);
}
#~ print Dumper(\%worklist);

# Write the index page. munge() prints one link per channel to INDEXFILE
# for every raw page it converts, so the handle has to stay a package-level
# bareword that the sub can see.
my $indexfile = "tvindex.html";
print "Creating index file '$indexfile'...\n";
open(INDEXFILE, '>', $indexfile)
    or die "unable to open '$indexfile' for write: $!";
print INDEXFILE start_html("TV Index"), h1("TV Index");

# The keys are fixed-width digit strings, so a plain string sort is also a
# numeric sort (presumably date order, if the tag is YYMMDD).
foreach my $datetag (sort keys %worklist) {
    munge(@{ $worklist{$datetag} });
}

print INDEXFILE end_html();
# Buffered write errors only surface at close, so check it.
close(INDEXFILE)
    or die "unable to finish writing '$indexfile': $!";
# munge($filename, $day, $date)
#
# Convert one raw listings page into a simplified page with one section per
# table column, and add a link for each section to the index.
#
#   $filename - path of the raw HTML page to read
#   $day      - day tag taken from the file name (e.g. 'Thu')
#   $date     - six-digit date tag taken from the file name; its three
#               digit pairs are displayed in reverse order, joined by '/'
#               (presumably YYMMDD shown as DD/MM/YY)
#
# Writes '<date>_<day>_out.html' in the current directory and prints one
# heading per column to the global INDEXFILE handle, which the caller must
# already have opened for writing. Progress messages go to STDOUT.
#
# Dies if $date is not six digits, if the page cannot be parsed, or if the
# output file cannot be opened or closed. A page that does not contain the
# expected table is reported with a warning and skipped (no output file, no
# index entries), so one bad page does not abort the whole run.
sub munge
{
    my ($filename, $day, $date) = @_;

    my ($p1, $p2, $p3) = $date =~ /^(\d\d)(\d\d)(\d\d)$/
        or die "bad date tag '$date' for '$filename' (expected six digits)";
    my $nicedate = "$p3/$p2/$p1";
    print "Found raw file '$filename' for $day $nicedate.\n";

    my $te = HTML::TableExtract->new(depth => 1, count => 0);
    print "Parsing file...\n";
    $te->parse_file($filename)
        or die "unable to parse '$filename': $!";

    print "Pulling table info...\n";
    my @rowlist;
    # Pull out the table of interest (depth 1, count 0)...
    foreach my $ts ($te->table_states) {
        my $coord = join(',', $ts->coords);
        next unless $coord eq '1,0';
        print "Table ($coord):\n";
        # Copy every row so the edits below work on our own arrays.
        push @rowlist, map { [@$_] } $ts->rows;
        last;
    }
    undef $te;    # <-- no longer needed

    unless (@rowlist) {
        warn "No table (1,0) with rows found in '$filename' - skipping.\n";
        return;
    }

    print "Transforming table info...\n";
    print "Data:-\n";
    foreach my $row (@rowlist) {
        shift @$row;    # <-- discard first column
        foreach my $cell (@$row) {
            # NOTE(review): cells may apparently come back undefined as
            # well as blank - confirm against HTML::TableExtract; either
            # way they are skipped further down.
            next unless defined $cell;
            $cell =~ s/\s+$//;    # <-- tidy up the many empty fields
        }
    }

    # The first row supplies the column headers; every column list starts
    # with its header (the channel name).
    my @hdrs = @{ shift @rowlist };
    my @cols = map { [$_] } @hdrs;

    # Append each non-empty cell to the list for its column.
    # NOTE(review): a row wider than the header row creates a column with
    # no header, so its first entry ends up printed as the heading.
    foreach my $row (@rowlist) {
        foreach my $c (0 .. $#$row) {
            my $val = $row->[$c];
            next unless defined $val && length $val;
            push @{ $cols[$c] }, $val;
        }
    }

    # @cols is now a list of the table's columns, i.e. a list of programs
    # for particular channels - the first element is the channel name.
    my $outfile = $date . "_" . $day . "_out.html";
    print "Printing to new html file '$outfile'...\n";
    open my $out, '>', $outfile
        or die "Can't open $outfile for writing: $!";
    print {$out} start_html("$day $nicedate"), h1("$day $nicedate");

    # NOTE(review): channel and program text is written without HTML
    # escaping; text containing '<' or '&' would need escapeHTML().
    my $chnum = 1;
    foreach my $col (@cols) {
        # A column slot can be missing when only a later column was filled.
        my ($chan, @progs) = @{ $col || [] };
        $chan = '' unless defined $chan;

        print {$out} h2("<a id=\"CH$chnum\" name=\"CH$chnum\"></a>$chan");
        print INDEXFILE h2("<a href=\"$outfile#CH$chnum\">$day $nicedate $chan</a>");
        $chnum++;

        foreach my $prog (@progs) {
            # Put a leading 'hh:mm' time in bold on a line of its own.
            $prog =~ s/^(\d+:\d+)\s+/<b>$1<\/b>\n<BR>\n/;
            print {$out} "$prog\n<BR>\n";
        }
    }

    print {$out} end_html();
    # Buffered write errors only surface at close, so check it.
    close $out
        or die "Can't finish writing $outfile: $!";
    return;
}