# --------------------------------------------------------------------------------- # Author: Dean Stringer (deeknow @ pobox.com) # Description/Purpose: # # Extracts newsgroup thread titles from a group specified on the command-line # and will optionally save new posts (header and body) to a local XML file. # (this XML file can then be post-processed using XSL into other formats) # # Generates a index file (XML) of posts found for the given newsgroup and # can optionally mark read posts as read so they dont need to be fetched # again (using a standard Newsrc.txt file) # # ....WHAT IT DOESNT DO.... # # This isnt a fully-fledged newsreader, its intended to be used as a digest # builder of new topics (ie ignoring replies (which constitute the majority # of most group posts). Therefore, features that arent supported are... # - handle multipart posts # - handle attachments/binaries # # Possible extensions: # - personal 'newsrc' files so individuals can configure their # own group subscriptions # - common/group filters to block spammers/noise/idiots # - cacheing of threads, temporary storage of whole thread and # periodic purging # # Dependencies: (Perl Modules) # Getopt::Std -> Parsing command-line options # XML::Writer -> Saving post bodies # News::Newsrc -> Managing read posts # Net::NNTP -> Connections to NNTP server and retireval of posts # HTML::Entities -> Encode special entities in the body (otherwise XML will not be well-formed) # # Expected Parameters: # See the $syntax variable contents following # # Returns: # Default output (unless supressed with 'q' switch) is to list a summary of post # totals, and titles of posts actually processed # # Sample Invocation: # perl ./nntp.pl -g alt.www.webmaster -s -r -q -m # ( read all posts in 'alt.www.webmaster', including 'r'ead ones, # 's'aving and 'm'arking as read and running in 'q'uiet mode ) # # Error situations: # newsgroup doesnt exist - error message displayed # unable to fetch message/NNTP server down - not handled .. yet # # See Also: # CPAN modules # http://search.cpan.org/search?dist=News-Newsrc # http://search.cpan.org/doc/GBARR/libnet-1.11/Net/NNTP.pm # # RFC977 RFC # http://www.faqs.org/rfcs/rfc977.html # # Command-line news-readers # Tin: http://duff-5.ucs.ualberta.ca/CNS/HELP/news/tin2.html # --------------------------------------------------------------------------------- # Revisions: # - REMd out Win32::ODBC connection out, will eventually use MySQL # - use strict'ed (loads of changes for this :-) # - dropped getHeader, getArticle and getBody subs as all support by method calls # - changed to XML::Writer instead of XML::DOM (lighter-weight/faster) # - added HTML::Entities to encode odd characters to make XML well-formed # - added use of Getopt::Std for command-line option parsing # - changed to process one group at a time only # - added 'x' option for arbitrary commands like 'markall' # --------------------------------------------------------------------------------- use strict 'vars'; use Getopt::Std; use Net::NNTP; # a class implementing a simple NNTP client in Perl as described in RFC977. use News::Newsrc; # Use this to manage article 'mark'ing so we don't # have to read all articles each time, and we don't # have to manage which article number we last read use XML::Writer; use HTML::Entities; # have to use this to encode unknown entities if present in the usenet post, # if we dont encode, then the XML::Writer module will create a non-well-formed # XML file. # --------------------------------------------------------------------------------- my %switchVal; getopt('ngox', \%switchVal); my $syntax = < are.... -d Debug mode (show module diagnostics and NNTP errors) -g Group Name (e.g. 'alt.www.webmaster') -m Mark as read -n Number (max) of Posts to read -o Output format of progress/summary, values 'text' (default) or 'xml' -q Quiet (supress progress output) show errors only -s Save posts (as XML) -r Process Replies (default is original/new posts only) -v Verbose output - echo post headers -x eXecute special functions.. 'markall' Mark all as read 'unmarkall' UnMark all as read END unless ($switchVal{g}) { # Require a group name to fetch die "\nPlease provide a (g)roupname to fetch. $syntax"; } my $group = $switchVal{g}; my $markAsRead = ($switchVal{m}) ? 1 : 0; my $default2Process = 10; # default number of posts to process my $max2Process = ($switchVal{n}) ? $switchVal{n} : $default2Process; my $hostname = 'news.mydomain.com'; my $newsFile = 'newsrc.txt'; my $postFileExt = '.txt'; my $dbaseName = 'usenet'; # used by the DNS connection my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time); my $runDate = "$mday/" . ($mon + 1) . "/" . ($year + 1900); my $outputMode = 'text'; if ($switchVal{o} =~ /(xml)/) { $outputMode = "xml";} my $savePath = ""; # --------------------------------------------------------------------------------- # Open up the 'newsrc' file and generate a list of un-marked articles # --------------------------------------------------------------------------------- my $newsrc = News::Newsrc->new; $newsrc->load($newsFile); # --------------------------------------------------------------------------------- # Connect to News host # --------------------------------------------------------------------------------- my $opt_debug = 0; # Enable the printing of debugging information to STDERR my $opt_timeout = 30; # default is 120secs my $nntp = Net::NNTP->new($hostname, Timeout => $opt_timeout, Debug => $switchVal{d} ? 1 : 0); # --------------------------------------------------------------------------------- # Check if we're invoking a special 'x' command # --------------------------------------------------------------------------------- if (lc($switchVal{x}) eq 'markall') { my ($numberOfArticles, $lowID, $highID) = $nntp->group($group) or die "ERROR: No such group '$group'" . $syntax; $newsrc->mark_range($group, $lowID, $highID); $newsrc->save; print "Marked All articles in $group.\nLowId was $lowID and HighID was $highID"; } elsif (lc($switchVal{x}) eq 'unmarkall') { my ($numberOfArticles, $lowID, $highID) = $nntp->group($group) or die "ERROR: No such group '$group'" . $syntax; $newsrc->unmark_range($group, $lowID, $highID); $newsrc->save; print "UnMarked All articles in $group.\nLowId was $lowID and HighID was $highID"; # --------------------------------------------------------------------------------- # Default is to process given group normally (ie read posts) # --------------------------------------------------------------------------------- } else { processGroup($group); if ($markAsRead) { $newsrc->save;} } $nntp->quit; exit; # ================================================================================================ # END OF MAIN START OF SUBS # ================================================================================================ sub processGroup { # --------------------------------------------------------------------------------- # Process a indicated group using the command-line paramaters # --------------------------------------------------------------------------------- my $group = shift; $savePath = "posts/" . $group; # --------------------------------------------------------------------------------- # Bomb out if we've been asked to save posts but the group folder doesnt exists # or isn't writable # --------------------------------------------------------------------------------- if ($switchVal{s}) { unless (-d $savePath) { die "\nERROR: No group folder found to write to." . $syntax; } unless (-w $savePath) { die "\nERROR: Can't write posts to group folder." . $syntax; } } # --------------------------------------------------------------------------------- # grab the total article number range and count # --------------------------------------------------------------------------------- my ($numberOfArticles, $lowID, $highID) = $nntp->group($group) or die "\nERROR: No such group '$group'" . $syntax; my (@articleNumbers, @unMarkedPostNumbers, @markedPostNumbers); my ($unMarkedCount, $markedCount); # --------------------------------------------------------------------------------- # Get the actual read/unread post lists from 'newsrc.txt' # --------------------------------------------------------------------------------- @unMarkedPostNumbers = $newsrc->unmarked_articles($group, $lowID, $highID); $unMarkedCount = @unMarkedPostNumbers; @markedPostNumbers = $newsrc->marked_articles($group, $lowID, $highID); $markedCount = @markedPostNumbers; # --------------------------------------------------------------------------------- # Show Summary / Post Couunt depending on whether were running in TXT or XML output # --------------------------------------------------------------------------------- my $output; unless ($switchVal{q}) { if ($outputMode eq 'xml') { $output = new XML::Writer; $output->startTag("posts", group => "$group", host => "$hostname", report_date => "$runDate", low_article_id => "$lowID", hi_article_id => "$highID", total_articles => "$numberOfArticles", articles_unmarked => "$unMarkedCount", articles_marked => "$markedCount" ); } else { print "\n" . "Group: $group" . "\n" . "Retrieval Date: $runDate" . "\n" . "Low Article number: $lowID" . "\n" . "High article number: $highID" . "\n" . "Total: $numberOfArticles" . "\n" . "UnMarked (new) Articles: $unMarkedCount" . "\n" . "Marked (old) Articles: $markedCount" . "\n\n" . "Posts fetched...\n\n"; } } my $processedItemsCount = 0; @articleNumbers = @unMarkedPostNumbers; # --------------------------------------------------------------------------------- # Now, finally, process the posts... # --------------------------------------------------------------------------------- for my $articleNum (@articleNumbers) { my $thisHeader = $nntp->head($articleNum); my %splitHeaders = parseHeaders($thisHeader); # --------------------------------------------------------------------------------- # Only process original posts, unless have been invoked with 'r' (replies) switch # A post is considered original if no references header value present # --------------------------------------------------------------------------------- if (($splitHeaders{References} eq "") || $switchVal{r}){ # --------------------------------------------------------------------------------- # Dont output anything if were in (q)uiet mode # --------------------------------------------------------------------------------- unless ($switchVal{q}) { if ($outputMode eq 'xml') { $output->startTag("post", "subject" => encode_entities($splitHeaders{Subject}), "message-id" => deBracket($splitHeaders{'Message-ID'}) ); $output->endTag("post"); } else { print "$splitHeaders{'Subject'}\n"; # \t[$splitHeaders{'Message-ID'}]\n"; } } # --------------------------------------------------------------------------------- # Save the post (as XML) if we're in (s)ave mode # --------------------------------------------------------------------------------- if ($switchVal{s}) { saveAsXML($nntp->body($articleNum), $articleNum, $savePath, %splitHeaders); } } if ($markAsRead) { $newsrc->mark($group, $articleNum); } $processedItemsCount++; last if ($processedItemsCount == $max2Process); } if ($outputMode eq 'xml') { $output->endTag("posts"); $output->end(); } else { unless ($switchVal{q}) { print "\nFinished!!!" }; } } sub deBracket { # ------------------------------------------------------------------------------------------------ # NNTP agents/servers wrap Message-IDs and References in pairs of '<' and '>' to delimit # one from another when chaining them together. I'm using this sub to remove these when we # want just the text without the delimeter (e.g. when writing to an XML file # ------------------------------------------------------------------------------------------------ my $thisString = shift; if ($thisString =~ /^<(.*)>$/) { $thisString = $1; } return $thisString; } sub saveAsXML { # ------------------------------------------------------------------------------------------------ # Take a message ID and write its header and body to an external XML file # ------------------------------------------------------------------------------------------------ use IO; my ($thisBody, $articleNum, $savePath, %splitHeaders) = @_; my $outputFile = $savePath . "/$articleNum.xml"; my $output = new IO::File(">$outputFile") || die("Cant open output file"); my $writer = new XML::Writer(OUTPUT => $output); $writer->startTag("post", id => $articleNum, host => $hostname); $writer->startTag("head"); $writer->startTag("subject"); $writer->characters(encode_entities($splitHeaders{"Subject"})); $writer->endTag("subject"); $writer->startTag("from"); $writer->characters($splitHeaders{"From"}); $writer->endTag("from"); $writer->startTag("date"); $writer->characters($splitHeaders{"Date"}); $writer->endTag("date"); $writer->startTag("newsgroups"); $writer->characters($splitHeaders{"Newsgroups"}); $writer->endTag("newsgroups"); $writer->startTag("message-id"); $writer->characters(deBracket($splitHeaders{"Message-ID"})); $writer->endTag("message-id"); $writer->startTag("path"); $writer->characters($splitHeaders{"Path"}); $writer->endTag("path"); $writer->startTag("lines"); $writer->characters($splitHeaders{"Lines"}); $writer->endTag("lines"); $writer->startTag("references"); $writer->characters($splitHeaders{"References"}); $writer->endTag("references"); $writer->endTag("head"); $writer->startTag("body"); my $thisBodyString = ""; foreach my $thisLine (@$thisBody) { $thisBodyString .= $thisLine } $writer->characters(encode_entities($thisBodyString)); $writer->endTag("body"); $writer->endTag("post"); $writer->end(); } sub parseHeaders { # ------------------------------------------------------------------------------------------------ # parses all lines in a post header and returns a hash-array using the header name as a key # ------------------------------------------------------------------------------------------------ my $thisHeader = shift; my $lookingForField = shift; my %headers; foreach my $thisLine (@$thisHeader) { if ($thisLine =~ /([\w\-]*):\s(.*)/ig) { $headers{$1}=$2; if ($switchVal{v}) { print "\n$1: $2"; } } } if ($switchVal{v}) { print "\n"; } return %headers; }