#!/usr/bin/perl -w
#
# suckdot-get:
# Retrieve the Slashdot front page, filter out the links, and emit a single
# HTML and RSS page.
#
# Copyright (c) 2003 Chris Lightfoot. All rights reserved.
# Email: chris@ex-parrot.com; WWW: http://www.ex-parrot.com/~chris/
#
# NOTE(review): the checked-in copy of this script had been run through
# something that stripped HTML tags and collapsed newlines, which destroyed
# the heredoc markers, the markup inside the templates and the tag-matching
# regexes.  Everything marked "reconstructed" below was rebuilt from the
# surviving text and should be compared against a pristine copy if one exists.
#

my $rcsid = ''; $rcsid .= '$Id: suckdot-get,v 1.2 2003/09/01 16:24:19 chris Exp $';

use strict;

use Error qw(:try);
use LWP::Simple;
use HTML::Entities;
use IO::File;
use IO::Pipe;

# Output locations.  Each file is written as "<name>.new" and then renamed
# over the live file, so readers never see a partially written page.
my $htmlfile = '/home/chris/public_html/suckdot/index.html';
my $rssfile = '/home/chris/public_html/suckdot/rss.xml';

# get_slashdot_front_page
# Fetch the Slashdot front page with LWP::Simple::get and return its text
# (whatever get() returns on failure is passed straight back to the caller).
sub get_slashdot_front_page () {
    return get('http://slashdot.org/');
}

my $text = get_slashdot_front_page()
    or die "can't retrieve Slashdot";

# We want to retrieve links which are parts of `stories'. These start `Posted
# by [someone]' and end `Read more...'.
my @ss = ($text =~ m#(Posted\s+by.+?Read\s+more)#sig);

die "too few stories found" unless (@ss > 5);

# One element per story that contained links; each element is a reference to
# a list of [url, link text] pairs.
my @storylinks = ( );
foreach my $story (@ss) {
    # Remove everything up to and including the posting time.
    $story =~ s#^.+?\@\d\d:\d\d[AP]M##s;
    # Remove any `... writes' bit.
    $story =~ s#^.+?writes ##si;

    my @links = ( );
    # NOTE(review): both anchor patterns are reconstructed; the originals lost
    # their <a ...> markup.
    foreach my $anchor ($story =~ m#(<a\s+href=.+?</a>)#gsi) {
        my ($url, $label) = ($anchor =~ m#<a\s+href="([^"]+)"[^>]*>(.+?)</a>#si);
        # Skip anchors we can't take apart rather than emitting undef values.
        next unless (defined($url) and defined($label));

        # Try to turn things into entities, if they're not already.
        $label =~ s/<[^>]+>//g;                 # drop tags nested in the link text
        $label =~ s/&(?![^\s]{0,5};)/&amp;/g;   # a bare `&' which doesn't start an entity
        $label =~ s/</&lt;/g;
        $label =~ s/>/&gt;/g;

        push(@links, [$url, $label]);
    }

    push(@storylinks, [@links]) if (@links > 0);
}

#die "too few stories with links found" unless (@storylinks > 5);

#
# HTML version
#

# NOTE(review): markup reconstructed; only the visible text survived.
my $html_head = <<'EOF';
<html>
<head>
<title>Suckdot</title>
</head>
<body>
<h1>Suckdot</h1>
<p>This is a collection of the links from Slashdot stories, the only possibly
useful content from that site. Alternatively, use the
<a href="rss.xml">RSS version</a>.</p>
<ul>
EOF

my $html_tail = <<'EOF';
</ul>
<p>Copyright &copy; Slashdot contributors. Software copyright &copy; 2003
Chris Lightfoot.</p>
</body>
</html>
EOF

# One outer list item per story, holding a nested list of that story's links.
my $html_body = join('', map {
        my $story_links = $_;
        "<li><ul>\n"
            . join('', map { sprintf(qq(<li><a href="%s">%s</a></li>\n), $_->[0], $_->[1]) } @$story_links)
            . "</ul></li>\n"
    } @storylinks);

my $f = IO::File->new("$htmlfile.new", O_WRONLY|O_CREAT|O_TRUNC)
    or die "html file: open: $!";
$f->print($html_head, $html_body, $html_tail)
    or die "html file: write: $!";
$f->close()
    or die "html file: close: $!";
chmod(0644, "$htmlfile.new")
    or die "html file: chmod: $!";
rename("$htmlfile.new", $htmlfile)
    or die "html file: rename: $!";

#
# RSS version
#

# NOTE(review): element names reconstructed from the surviving values (the
# userland URL is the RSS 2.0 `docs' value and 15 reads as a `ttl').
my $rss_head = <<'EOF';
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
<channel>
<title>Suckdot: cutting the crap from Slashdot</title>
<link>http://ex-parrot.com/~chris/suckdot/</link>
<description>The links from Slashdot, with none of the "comment"</description>
<copyright>copyright &#169; Slashdot contributors</copyright>
<generator>suckdot-get</generator>
<docs>http://backend.userland.com/rss</docs>
<ttl>15</ttl>
EOF

my $rss_tail = <<'EOF';
</channel>
</rss>
EOF

my $rss_body = join('', map {
        my $story_links = $_;
        # encode_entities is wrong but we have to rely on Slashdot's brokenness
        join('', map {
                sprintf(qq(<item><link>%s</link><title>%s</title></item>\n),
                    encode_entities($_->[0]), $_->[1])
            } @$story_links)
    } @storylinks);

# The feed is piped through iconv to convert it from cp1252 to UTF-8.  The
# output file name is handed to the shell as a positional parameter instead of
# being interpolated into the command string.
$f = IO::Pipe->new()
    or die "rss file: pipe: $!";
$f->writer('/bin/sh', '-c', 'iconv -f cp1252 -t utf-8 > "$1"', 'sh', "$rssfile.new")
    or die "rss file: exec: $!";
$f->print($rss_head, $rss_body, $rss_tail)
    or die "rss file: write: $!";
$f->close()
    or die "rss file: close: $!";

# IO::Pipe reaps the child when the pipe is closed, which leaves its wait
# status in $?.  Refuse to publish the feed if the conversion failed, since
# the .new file may then be empty or truncated.
die "rss file: iconv failed (wait status $?)" if (0 != $?);

chmod(0644, "$rssfile.new")
    or die "rss file: chmod: $!";
rename("$rssfile.new", $rssfile)
    or die "rss file: rename: $!";

# success