#!/usr/bin/perl -w
#
# suckdot-get:
# Retrieve the Slashdot front page, filter out the links, and emit a single
# HTML and RSS page.
#
# Copyright (c) 2003 Chris Lightfoot. All rights reserved.
# Email: chris@ex-parrot.com; WWW: http://www.ex-parrot.com/~chris/
#
my $rcsid = ''; $rcsid .= '$Id: suckdot-get,v 1.2 2003/09/01 16:24:19 chris Exp $';
use strict;
use Error qw(:try);
use LWP::Simple;
use HTML::Entities;
use IO::File;
use IO::Pipe;
# Paths of the HTML and RSS files. Presumably these are where the generated
# pages get written; the code that uses them is outside this part of the file.
my ($htmlfile, $rssfile) = (
    '/home/chris/public_html/suckdot/index.html',
    '/home/chris/public_html/suckdot/rss.xml',
);
# get_slashdot_front_page
# Fetch http://slashdot.org/ with LWP::Simple's get() and hand back whatever
# get() returned. Takes no arguments.
sub get_slashdot_front_page () {
    my $front_page = get('http://slashdot.org/');
    return $front_page;
}
# Fetch the front page; `or die' aborts if nothing usable came back.
my $text = get_slashdot_front_page() or die "can't retrieve Slashdot";

# We want to retrieve links which are parts of `stories'. These start `Posted
# by [someone]' and end `Read more...'.
my @ss = ($text =~ m#(Posted\s+by.+?Read\s+more)#sig);
die "too few stories found" unless (@ss > 5);

# @storylinks gets one entry per story which contains at least one link; each
# entry is a reference to a list of [URL, link text] pairs.
my @storylinks = ( );
my $N = 0;      # not used in this part of the script
foreach my $story (@ss) {
    # Remove everything up to and including the `@HH:MM[AP]M' timestamp.
    $story =~ s#^.+?@\d\d:\d\d[AP]M##s;
    # Remove any `... writes' bit.
    $story =~ s#^.+?writes ##si;

    my @links = ( );
    # NOTE(review): the anchor pattern here and the entity replacements below
    # are reconstructions -- the previous text had lost everything that looked
    # like an HTML tag or entity, leaving a pattern which matched single
    # characters and substitutions which did nothing. Confirm against the
    # markup Slashdot actually serves.
    #
    # Captures: $1 is the quote character, $2 the href value, $3 the anchor
    # content. Only quoted href attributes are recognised.
    while ($story =~ m#<a\b[^>]*?\bhref\s*=\s*(["'])([^<>]+?)\1[^>]*>(.+?)</a>#gsi) {
        my ($url, $label) = ($2, $3);
        # Try to turn things into entities, if they're not already.
        $label =~ s/<[^>]+>//g;                     # drop nested markup
        $label =~ s/&(?![^\s]{0,5};)/&amp;/g;       # bare `&' only
        $label =~ s/</&lt;/g;
        $label =~ s/>/&gt;/g;
        push(@links, [$url, $label]);
    }
    push(@storylinks, [@links]) if (@links > 0);
}
#die "too few stories with links found" unless (@storylinks > 5);
#
# HTML version
#
my $html_head = <
Suckdot
Suckdot
This is a collection of the links from
Slashdot stories, the only possibly useful
content from that site. Alternatively, use the RSS
version.