# -----------------------------------------------------------------------------------------------
# Author: Dean Stringer
# Description:
#   This script is intended to be run from the command-line
#   and reads STDIN for a list of HTML filenames (one per line), each of
#   which it parses looking for MetaData and links.
#
#   For every file it prints the <title> text and all named META values;
#   when $showLinks is true it also prints the href of every anchor.
#   A summary with the number of files read is printed at the end.
# -----------------------------------------------------------------------------------------------
use strict;
use warnings;
use HTML::TokeParser;

my $totalFiles = 0;    # number of files successfully read so far
my $showLinks  = 0;    # set to a true value to also list each file's links
my $lineBreak  = "-----------------------------------------------------------------------------------\n";

print "\n\n" . $lineBreak . "Note: See end of output for Summary\n" . $lineBreak;

while (my $sourceFile = <STDIN>) {
    # chomp (not chop): only strip a trailing newline, so a final line
    # without one does not lose the last character of the filename.
    chomp $sourceFile;

    # A blank line is not a filename; skip it rather than aborting the run.
    next unless length $sourceFile;

    if (!-e $sourceFile) {
        dieNice("File '$sourceFile' does not exist.");
    }

    # Slurp the whole file into $content.
    my $content = do {
        local $/ = undef;    # no record separator => read everything at once
        open(my $in, '<', $sourceFile)
            or dieNice("Can't open include file [ $sourceFile ]\n");
        my $text = <$in>;
        close $in;
        $text;
    };
    $content = '' unless defined $content;

    $totalFiles++;

    # Show filename
    print "\n\n" . $lineBreak . "File: $sourceFile\n" . $lineBreak;

    # Get the Title tag
    print "Title: " . parseTag($content, 'title') . "\n";

    # Get a META tag (disabled example of fetching a single value)
    # my $description = parseMeta($content, 'keywords');
    # $pageBody .= "\nDescription is $description";

    # Get ALL META Data values
    my %metaValues = parseAllMeta($content);
    print "Meta Data:\n";

    # Sorted so the report is stable from run to run.
    for my $name (sort keys %metaValues) {
        my $value = defined $metaValues{$name} ? $metaValues{$name} : '';

        # Expand each CR to CRLF; together with the next substitution this
        # makes every CR (alone or in a CRLF pair) end up as a single space.
        $value =~ s/\r/\r\n/g;

        # Collapse runs of two or more white-space characters to one space.
        # (The replacement must be a literal space: "\s" there is just "s".)
        $value =~ s/\s{2,}/ /g;

        print "\t" . ucfirst($name) . ":" . $value . "\n";
    }

    if ($showLinks) {
        my %links = parseLinks($content);
        print "Links to:";
        for my $label (sort keys %links) {
            print "\n\t$links{$label}";
        }
    }
}

print "\n\n\n" . $lineBreak . "Summary:\n\n\tFiles Read: " . $totalFiles . "\n" . $lineBreak;
exit;

sub parseTag {
    # --------------------------------------------------------------
    # Return the trimmed text that follows the first occurrence of
    # a given tag (used by the main loop to fetch the <title>).
    #   $webPage - HTML source as a string
    #   $tag2Get - tag name to look for, e.g. 'title'
    # --------------------------------------------------------------
    my $webPage = shift;
    my $tag2Get = shift;

    my $parser = HTML::TokeParser->new(\$webPage);
    $parser->get_tag($tag2Get);          # Advance to the requested tag
    return $parser->get_trimmed_text;    # Text after it, white space trimmed
}

sub parseMeta {
    # --------------------------------------------------------------
    # Grab a specific META tag.
    #   $webPage - HTML source as a string
    #   $tag2Get - pattern matched (case-insensitively) against the
    #              'name' attribute of each META tag
    # Returns the 'content' of the LAST matching META tag, or ''
    # when none matches.
    # --------------------------------------------------------------
    my $webPage  = shift;
    my $tag2Get  = shift;
    my $tagValue = '';

    my $parser = HTML::TokeParser->new(\$webPage);
    while (my $token = $parser->get_tag("meta")) {
        my $name = $token->[1]{name};
        next unless defined $name;       # e.g. HTTP-EQUIV metas have no name
        if ($name =~ /$tag2Get/i) {
            $tagValue = $token->[1]{content};
        }
    }
    return $tagValue;
}

sub parseAllMeta {
    # --------------------------------------------------------------
    # Grab ALL META tags and return as a Hash (name => content).
    # Much quicker than creating a new parser instance each
    # time (as in parseMeta) for each Meta element.
    #   $webPage - HTML source as a string
    # --------------------------------------------------------------
    my $webPage = shift;
    my %metaValues;

    my $parser = HTML::TokeParser->new(\$webPage);
    while (my $token = $parser->get_tag("meta")) {
        # Make sure the Meta item has a 'name' attribute, otherwise it
        # could be another HTTP header fiddle like HTTP-EQUIV or similar.
        my $name = $token->[1]{name};
        if (defined $name && $name ne "") {
            $metaValues{$name} = $token->[1]{content};
        }
    }
    return %metaValues;
}

sub parseLinks {
    # --------------------------------------------------------------
    # Grab ALL anchor tags and return as a Hash (label => URL),
    # where the label is the trimmed text following the <a> tag and
    # the URL is its 'href' attribute. Anchors without an href are
    # ignored; anchors sharing a label overwrite one another.
    #   $webPage - HTML source as a string
    # --------------------------------------------------------------
    my $webPage = shift;
    my %links;

    my $parser = HTML::TokeParser->new(\$webPage);
    while (my $token = $parser->get_tag("a")) {
        my $label = $parser->get_trimmed_text;    # Trim off the white space
        my $URL   = $token->[1]{href};
        if (defined $URL && $URL ne "") {
            $links{$label} = $URL;
        }
    }
    return %links;
}

sub dieNice {
    # --------------------------------------------------------------
    # Bomb out with a supplied error message and show syntax.
    # Exits with a non-zero status so callers can detect the failure.
    #   $errorMessage - text to show after the "ERROR !!!" prefix
    # --------------------------------------------------------------
    my $errorMessage = shift;
    print "\nERROR !!! $errorMessage\n";
    print "Syntax is : metageta.pl \n";
    exit 1;
}