#!/usr/bin/perl #----------------------------------------------------------------------------- # # nexus [url|filename] # # http://www.remote.org/jochen/software/nexus/ # #----------------------------------------------------------------------------- # # 2004-01-30 version 0.1 # #----------------------------------------------------------------------------- # # Copyright (C) 2004 Jochen Topf # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA # #----------------------------------------------------------------------------- use strict; my $OUTPUT_VERSION = '0.91'; use XML::RSS; use LWP::Simple; my $rss = new XML::RSS( encoding => 'iso-8859-1', output => $OUTPUT_VERSION); if ($ARGV[0] =~ /^http:/) { my $content = get($ARGV[0]); die("Could not download RSS file from $ARGV[0]\n") unless (defined $content); $rss->parse($content); } else { $rss->parsefile($ARGV[0]); } foreach my $item (@{$rss->{'items'}}) { $item->{'description'} = get_desc($item->{'link'}); } print $rss->as_string, "\n"; #----------------------------------------------------------------------------- sub get_desc { my ($url) = @_; my $content = get($url); die("Could not download item from $url\n") unless (defined $content); return cleanup(autoextract($content)); } #----------------------------------------------------------------------------- sub cleanup { my ($c) = @_; $c =~ s/<\/?table[^>]*>//gi; $c =~ s/<\/?tr[^>]*>//gi; $c =~ s/<\/?td[^>]*>//gi; return $c; } #----------------------------------------------------------------------------- sub get_tag { my ($c, $tag, $se) = @_; my $i; my @found; my $p = 0; while (($i = index($c, $tag, $p)) != -1) { push(@found, [ $i, $se ]); $p = $i+1; } return @found; } #----------------------------------------------------------------------------- sub autoextract { my ($c) = @_; my @found = ( get_tag($c, '', 'end'), get_tag($c, '', 'end') ); my @stack; my @pairs; foreach (sort { $a->[0] <=> $b->[0] } (@found)) { if ($_->[1] eq "start") { push(@stack, $_->[0]); } elsif ($_->[1] eq "end") { push(@pairs, [ pop(@stack), $_->[0] ]); } else { die("Should never be here\n"); } } my $cand; foreach my $pair (sort { $a->[0] <=> $b->[0] } (@pairs)) { my $part = substr($c, $pair->[0], $pair->[1] - $pair->[0]); $part =~ s/ +/ /g; my $len = length($part); (my $wotags = $part) =~ s/<[^>]+>//g; my $tags = length($wotags); my $ratio = $tags/$len; if ($len > 500) { if ($cand) { if ($ratio > $cand->[2]) { $cand = $pair; $cand->[2] = $ratio; } } else { $cand = $pair; $cand->[2] = $ratio; } } } return substr($c, $cand->[0], $cand->[1] - $cand->[0]) . "\n"; } #-- THE END ------------------------------------------------------------------