URLとXPathを渡すと値のリストを返してくれるツールを書いた

jottitのほうに公式ページをつくりました。

画像一覧からXPathでぶっこぬいてダウンローダにわたす、といった用途から簡単なマイニングまでべんりにつかう予定。

つかいかた

% ./exthtml.pl [ -a [AGENT] -e [REFERER] -c [COOKIE_JAR] ] -x [XPATH] [URL]
% perl exthtml.pl -a Mozilla -c ~/Library/Cookies/Cookies.plist -X "//img[@class='image']/@src" http://www.tumblr.com/dashboard
http://media.tumblr.com/XEVQcg9Zzaj7driiTV8baCRJ_400.gif
http://media.tumblr.com/G6je7UvklaklnqmgypCVpgBg_100.jpg
http://media.tumblr.com/G6je7UvklaklnqmgypCVpgBg_400.jpg
http://media.tumblr.com/Bcb4YCrxh3s3i9vsNiAe7H0C_100.jpg
http://media.tumblr.com/Bcb4YCrxh3s3i9vsNiAe7H0C_400.jpg
http://media.tumblr.com/cqRYWOkYRambcyqr1PmewR2r_100.jpg
http://media.tumblr.com/cqRYWOkYRambcyqr1PmewR2r_400.jpg
http://media.tumblr.com/vj8toQRftakozppfgR7H0j40_400.jpg
http://media.tumblr.com/51bAnBJJUamcm3iu9QVSYb2t_100.png
http://media.tumblr.com/51bAnBJJUamcm3iu9QVSYb2t_400.png
% perl exthtml.pl -a Mozilla -c ~/Library/Cookies/Cookies.plist -X "//table[@id='timeline']//img[@class='photo fn']/@alt" http://twitter.com/ | sort | uniq -c
   1 Tadaki Osawa
   1 Vol.2%
   2 Yusuke Yanbe
   1 hanemimi
   2 log_070702
   1 oklahomamixer
   2 totoounk皇子
   2 ☆ギンギンギラギラギンギンのもりひろ☆
   2 オリハタ
   3 ヤマタケ
   1 小池 陸
   1 岸田渉
   1 星村

code

#!/usr/bin/perl
use strict;
use warnings;

use Encode;
use Getopt::Long;

use LWP::UserAgent;
use HTTP::Cookies::Guess;

my $url = pop @ARGV;

my ($xpath, $referer, $cookie, $agent);
my $result = GetOptions(
    "x|xpath=s" => \$xpath,
    "e|referer=s" => \$referer,
    "c|cookie-jar=s" => \$cookie,
    "a|agent=s" => \$agent,
);

die "usage: ./exthtml.pl [ -a [AGENT] -e [REFERER] -c [COOKIE_JAR] ] -x [XPATH] [URL]" unless ($url && $xpath);

$xpath = decode('utf-8', $xpath);

my $ua = LWP::UserAgent->new;
$ua->cookie_jar(HTTP::Cookies::Guess->create(file => $cookie)) if ($cookie);
$ua->agent($agent) if ($agent);

my $tree = HTML::TreeBuilder::XPath::Remote->new_from_uri($url, $ua, $referer);
for my $node ($tree->findnodes($xpath)) {
    print encode('utf-8', $node->getValue."\n");
}

package HTML::TreeBuilder::XPath::Remote;
use strict;
use warnings;

use List::Util qw( first );

use Encode;
require Encode::Detect;

use HTML::TreeBuilder::XPath;
use HTML::ResolveLink;

use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response::Encoding;

sub new_from_uri {
    my ($pkg, $uri, $ua, $referer) = @_;
    
    my $resolver = HTML::ResolveLink->new(
        base => $uri,
    );
    
    return HTML::TreeBuilder::XPath->new_from_content(
        $resolver->resolve(
            $pkg->get($uri, $ua, $referer)
        )
    );
}

sub get {
    my ($self, $uri, $ua, $referer) = @_;
    
    my $html;
    
    $ua ||= LWP::UserAgent->new();
    my $req = HTTP::Request->new('GET', $uri);
    $req->header(referer => $referer) if ($referer);
    
    my $res = $ua->request($req);
    
    # this detection is based on Web::Scraper.
    if ($res->is_success) {
        my @encoding = (
            $res->encoding,
            ($res->header('Content-Type') =~ /charset=([\w\-]+)/g),
            "Detect",
            "shift-jis"
        );
        my $encoding = first {
            defined $_ && Encode::find_encoding($_)
        } @encoding;
        
        $html = Encode::decode($encoding, $res->content);
        return $html;
    }
    return;
}