#! perl -w
# tblext.pl
# Process the raw pages into something simple - i.e. usable on the PalmPilot
use strict;
use HTML::TableExtract;
use Data::Dumper;
use File::Find;
use CGI qw/:standard/;           # load standard CGI routines
# Make STDOUT and STDERR "hot" (autoflushed) so that progress messages are
# written as soon as they are printed (not really needed)...
select((select(STDOUT), $| = 1)[0]);
select((select(STDERR), $| = 1)[0]);

# Raw files found by the scan, keyed by the six-digit date tag taken from the
# file name; each value is [path, day tag, date tag] as stored by process().
# This is declared BEFORE the scan runs: the callback used to fill the hash
# before its 'my' had executed, which only worked because a bare 'my' does
# not reset the variable - adding an initialiser would have wiped the results.
my %worklist;

# The raw page files are named like '011213_Thu.html'; process() decides
# which of the files visited below are picked up.
find({ wanted => \&process, no_chdir => 1 }, '.');
# File::Find 'wanted' callback.  The visited path arrives in $_ (find() is
# run with no_chdir, so it is relative to the start directory '.').  Plain
# files named './<six digits>_<three characters>.html' are recorded in
# %worklist under their six-digit tag as [path, three-character tag, tag];
# everything else is ignored.
sub process
{
    my $path = $_;

    # Directories and other non-plain files are of no interest.
    return unless -f $path;

    # A failed match yields an empty list, so the assignment counts as false.
    my ($date, $day) = $path =~ /^\.\/(\d\d\d\d\d\d)_(...)\.html$/i
        or return;

    $worklist{$date} = [$path, $day, $date];
    #~ munge($path, $day, $date);
}
#~ print Dumper(\%worklist);
# Write an index page.  munge() prints one entry per channel to INDEXFILE
# while it converts each raw file, so the handle has to stay a package
# (bareword) handle and must remain open until every file has been processed.
my $indexfile = "tvindex.html";
print "Creating index file '$indexfile'...\n";
open INDEXFILE, '>', $indexfile
    or die "unable to open '$indexfile' for write: $!";
print INDEXFILE start_html("TV Index"), h1("TV Index");

# The keys are fixed-width digit strings, so a plain string sort orders them
# numerically (presumably chronologically, if the tag is yymmdd - confirm).
foreach my $date (sort keys %worklist) {
    munge(@{ $worklist{$date} });
}

print INDEXFILE end_html();

# Buffered write errors only surface when the handle is closed, so check it.
close INDEXFILE
    or die "unable to close '$indexfile' after write: $!";
# munge($filename, $day, $date)
#
# Convert one raw listings page into a simplified HTML page organised by
# column (channel), and add a link for every channel to the index.
#
#   $filename - path of the raw HTML file to read
#   $day      - three-character tag from the file name (e.g. 'Thu')
#   $date     - six-digit tag from the file name (e.g. '011213')
#
# Side effects: writes '<date>_<day>_out.html' in the current directory and
# prints one <h2> link per channel to the package handle INDEXFILE, which the
# caller must already have opened for writing.  Progress goes to STDOUT.
#
# Dies if $date is not six digits, if the file cannot be parsed, if the
# expected table is missing, or if the output file cannot be written.
#
# NOTE(review): channel and programme text is written to the HTML output
# without escaping; if the extracted text can contain '<' or '&' it will be
# emitted as markup - confirm whether escaping is wanted here.
sub munge
{
    my($filename, $day, $date) = @_;

    # Reverse the three two-digit groups for display ('011213' -> '13/12/01').
    # The match is checked so stale capture variables can never be used.
    my @date_parts = $date =~ /^(\d\d)(\d\d)(\d\d)$/
        or die "Malformed date '$date' for file '$filename'";
    my $nicedate = join('/', reverse @date_parts);
    print "Found raw file '$filename' for $day $nicedate.\n";

    my $te = HTML::TableExtract->new(depth => 1, count => 0);

    print "Parsing file...\n";
    $te->parse_file($filename) or die "$!";

    print "Pulling table info...\n";
    my @rowlist;
    # Pull out the table of interest (depth 1, count 0)...
    foreach my $ts ($te->table_states) {
        my $coord = join(',', $ts->coords);
        next unless $coord eq '1,0';
        print "Table ($coord):\n";
        foreach my $row ($ts->rows) {
            # Copy each row so the data survives releasing the extractor.
            # Undefined cells become '' so that the tidy-up and length
            # checks below do not warn under -w; '' is skipped exactly as
            # an undefined cell was.
            push @rowlist, [ map { defined $_ ? $_ : '' } @$row ];
        }
        last;
    }
    undef $te; # <-- no longer needed

    print "Transforming table info...\n";

    print "Data:-\n";
    foreach my $row (@rowlist) {
        shift @$row;                 # <-- discard first column
        s/\s+$// foreach @$row;      # <-- tidy up the many empty fields
    }
    #print Dumper(\@rowlist);

    # Without at least a header row there is nothing to transpose; fail with
    # a clear message instead of dereferencing an undefined value below.
    die "No rows found in table 1,0 of '$filename'"
        unless @rowlist;

    # Transpose the rows into column lists.  The first row holds the column
    # headers, so each column list starts with its header...
    my @hdrs = @{ shift @rowlist };
    #print Dumper(@hdrs);
    my @cols = map { [$_] } @hdrs;

    # ...followed by every non-empty cell of that column, in row order.
    # Note: a cell lying beyond the header width starts a new column, whose
    # first value is then treated as the channel name further down.
    foreach my $row (@rowlist) {
        foreach my $c (0 .. $#$row) {
            my $val = $row->[$c];
            next unless length $val;
            #~ print "$c : $val\n";
            push @{ $cols[$c] }, $val;
        }
    }

    # @cols is now a list of the table's columns i.e. a list of programs for
    # particular channels - the first element is the channel name
    #print Dumper(@cols);
    my $outfile = $date."_".$day."_out.html";
    print "Printing to new html file '$outfile'...\n";
    open my $out_fh, '>', $outfile
        or die "Can't open $outfile for writing: $!";

    print {$out_fh} start_html("$day $nicedate"), h1("$day $nicedate");
    my $chnum = 1;
    foreach my $col (@cols) {
        # Work on a copy so @cols itself is left untouched.
        my ($chan, @progs) = @$col;

        # Anchor in the day page, and a matching link in the index.
        print {$out_fh} h2("<a id=\"CH$chnum\" name=\"CH$chnum\"></a>$chan");
        print INDEXFILE h2("<a href=\"$outfile#CH$chnum\">$day $nicedate $chan</a>");
        $chnum++;

        foreach my $prog (@progs) {
            # Put a leading 'digits:digits' time in bold on its own line.
            $prog =~ s/^(\d+:\d+)\s+/<b>$1<\/b>\n<BR>\n/;
            print {$out_fh} "$prog\n<BR>\n";
        }
    }
    #~ print Dumper(@cols);
    print {$out_fh} end_html();

    # Buffered write errors only surface at close, so check it.
    close $out_fh
        or die "Can't close $outfile after writing: $!";
}