#!/usr/bin/perl #++ # $Id: rsslurp.pl,v 2.5 2011/07/26 01:57:50 az Exp $ # # File: rsslurp.pl # Date: Fri Jul 23 13:41:11 2004 # Author: Alexander Zangerl (az) # # Abstract: # read bookmarks topicmap, look for particular is-a, # capture the rss feeds and plop them all in a dir # #-- use strict; use XML::Feed; use LWP::UserAgent; use HTTP::Status qw(:constants); use Getopt::Std; use URI::Escape; use URI; use Digest::MD5 qw(md5_hex); use Sys::Syslog qw(openlog syslog setlogsock); use Data::Dumper; use File::Glob ':glob'; use File::Basename; use File::Slurp; my $progname="rsslurp"; my %opts; (!getopts("c:d:vo:s",\%opts) or @ARGV!=1) and die "usage: $0 [-c cssurl] [-d destbase] [-o overviewdoc] [-vs] mapdir\ndestbase is curdir\n-s is for syslogging messages\n"; my $css=$opts{c} || "rssreader.css"; my $destdir=$opts{d} || "."; my $exitcode=0; my $overview=$opts{o} || "overview.html"; # type of topics to check, oc types for source and local dest my $isa="rssfeed"; my $type_s="source"; my $type_d="local"; my $mapdir=$ARGV[0]; my %handled; # must keep separate user agents for ssl and non-ssl connections # NOTE: CANT call env_proxy for ssl-agent, or ssl-connect-proxying fails! (see man crypt::ssleay) my $sua=LWP::UserAgent->new; $sua->agent("Mozilla"); # some sites don't like libwww as useragent my $ua=LWP::UserAgent->new; $ua->agent("Mozilla"); $ua->env_proxy; foreach my $m (sort glob("$mapdir/topics/$isa/instance/*")) { my ($source,$dest,@f,$topic); $topic=basename($m); @f=glob("$m/oc/*:$type_d"); next if (@f!=1); # ignore unworkables $dest=read_file($f[0]); chomp $dest; @f=glob("$m/oc/*:$type_s"); next if (@f!=1); $source=read_file($f[0]); chomp $source; my $duri=URI->new($dest); die "dest $dest for $topic is not local!\n" if ($duri->scheme ne "file"); $dest=$destdir."/".$duri->path; my $cached=$dest.".cache"; my $oldcached=$cached."-old"; my $response=($source =~ /^https:/i?$sua:$ua)->mirror($source,$cached); if (!$response->is_success && $response->code!=HTTP_NOT_MODIFIED) { my $problem=$response->status_line; debug("mirroring of $source failed with $problem\n"); $exitcode=1; next; } # record NOW when this feed has last been covered # used later on to generate overview page $handled{$topic}={"path"=>$duri->path, "mtime"=>(stat($cached))[9]}; if ($response->code == HTTP_NOT_MODIFIED && -r $dest) { debug("feed $topic: no change\n"); next; } elsif (-f $oldcached and !(0xffff && system("diff -q $cached $oldcached >/dev/null"))) { debug("feed $topic: no change (xml)\n"); # record old time as there is no real change $handled{$topic}={"path"=>$duri->path, "mtime"=>(stat($oldcached))[9]}; next; } debug("feed $topic: "); # now update the second cached copy to reflect the newest real version unlink($oldcached); system("cp","-p",$cached,$oldcached); my $content=read_file($cached); my $result=rss2html($content,$css); if (!$result) { $exitcode=1; unlink $dest,$cached,$oldcached; delete $handled{$topic}; } else { # do a sanity check on the actual data: different output? # some blogs have ever-changing xml... my $previous; open(F,"<:encoding(utf8)",$dest); while () { $previous.=$_; } close F; if ($previous eq $result) { debug("no change (content)\n"); $handled{$topic}->{mtime}=(stat($dest))[9]; } else { debug("saving\n"); # output utf8 open(F,">:encoding(utf8)",$dest) or die "can't open >$dest: $!\n"; print F $result; close F; } } } # produce overview page open(F,">$destdir/$overview") or die "can't write to $destdir/$overview: $!\n"; print F qq| RSS - Overview|; my $lastdate; my $colour=0; foreach (sort { $handled{$b}->{mtime}<=>$handled{$a}->{mtime} } keys %handled) { my $thisdate=sprintf("%d-%.02d-%.02d", (localtime($handled{$_}->{mtime}))[5,4,3]); my ($flink,$fdate)=($handled{$_}->{path}, scalar localtime($handled{$_}->{mtime})); $colour++ if ($lastdate && $lastdate ne $thisdate && $colour<10); $lastdate=$thisdate; print F qq|

$_

$fdate

|; } print F qq||; close F; exit $exitcode; # takes rss text, returns htmlifiied sub rss2html { my ($content,$cssurl)=@_; my $feed; eval { $feed=XML::Feed->parse(\$content); }; if ($@ || !$feed) { debug("parsing failed: $@\n"); return undef; } my $ftitle=$feed->title; my $fdesc=$feed->description; my $flink=$feed->link; my $result= qq| RSS - $ftitle |; $result.= qq|

$ftitle

|; $result .= qq|

$fdesc

| if ($fdesc=~/\S/); $result .= qq|
|; # print the channel items my $c=1; for my $item ($feed->entries) { my $title=$item->title; my $link=$item->link; my $date=$item->issued||$item->modified; $date=localtime($date->epoch) if ($date); my $desc=$item->content; $desc=$item->summary if (!$desc); $desc=$desc->body if ($desc); # use the time for linking if there's no title if ($date && !$title) { $title=$date; undef $date; } $result.= qq|

$c $title  $date

|; $result.=qq|
$desc
| if ($desc=~/\S/); $result.=qq|
|; $c++; } $result .= qq||; return $result; } sub debug { if ($opts{"v"}) { if ($opts{"s"}) { setlogsock("unix"); openlog($progname,"nowait","daemon"); syslog("info",@_); } else { print STDERR @_; } } }