URLとXPathを渡すと値のリストを返してくれるツールを書いた
画像一覧からXPathでぶっこぬいてダウンローダにわたす、といった用途から簡単なマイニングまでべんりにつかう予定。
つかいかた
% ./exthtml.pl [ -a [AGENT] -e [REFERER] -c [COOKIE_JAR] ] -x [XPATH] [URL] % perl exthtml.pl -a Mozilla -c ~/Library/Cookies/Cookies.plist -X "//img[@class='image']/@src" http://www.tumblr.com/dashboard http://media.tumblr.com/XEVQcg9Zzaj7driiTV8baCRJ_400.gif http://media.tumblr.com/G6je7UvklaklnqmgypCVpgBg_100.jpg http://media.tumblr.com/G6je7UvklaklnqmgypCVpgBg_400.jpg http://media.tumblr.com/Bcb4YCrxh3s3i9vsNiAe7H0C_100.jpg http://media.tumblr.com/Bcb4YCrxh3s3i9vsNiAe7H0C_400.jpg http://media.tumblr.com/cqRYWOkYRambcyqr1PmewR2r_100.jpg http://media.tumblr.com/cqRYWOkYRambcyqr1PmewR2r_400.jpg http://media.tumblr.com/vj8toQRftakozppfgR7H0j40_400.jpg http://media.tumblr.com/51bAnBJJUamcm3iu9QVSYb2t_100.png http://media.tumblr.com/51bAnBJJUamcm3iu9QVSYb2t_400.png % perl exthtml.pl -a Mozilla -c ~/Library/Cookies/Cookies.plist -X "//table[@id='timeline']//img[@class='photo fn']/@alt" http://twitter.com/ | sort | uniq -c 1 Tadaki Osawa 1 Vol.2% 2 Yusuke Yanbe 1 hanemimi 2 log_070702 1 oklahomamixer 2 totoounk皇子 2 ☆ギンギンギラギラギンギンのもりひろ☆ 2 オリハタ 3 ヤマタケ 1 小池 陸 1 岸田渉 1 星村
code
#!/usr/bin/perl use strict; use warnings; use Encode; use Getopt::Long; use LWP::UserAgent; use HTTP::Cookies::Guess; my $url = pop @ARGV; my ($xpath, $referer, $cookie, $agent); my $result = GetOptions( "x|xpath=s" => \$xpath, "e|referer=s" => \$referer, "c|cookie-jar=s" => \$cookie, "a|agent=s" => \$agent, ); die "usage: ./exthtml.pl [ -a [AGENT] -e [REFERER] -c [COOKIE_JAR] ] -x [XPATH] [URL]" unless ($url && $xpath); $xpath = decode('utf-8', $xpath); my $ua = LWP::UserAgent->new; $ua->cookie_jar(HTTP::Cookies::Guess->create(file => $cookie)) if ($cookie); $ua->agent($agent) if ($agent); my $tree = HTML::TreeBuilder::XPath::Remote->new_from_uri($url, $ua, $referer); for my $node ($tree->findnodes($xpath)) { print encode('utf-8', $node->getValue."\n"); } package HTML::TreeBuilder::XPath::Remote; use strict; use warnings; use List::Util qw( first ); use Encode; require Encode::Detect; use HTML::TreeBuilder::XPath; use HTML::ResolveLink; use LWP::UserAgent; use HTTP::Request; use HTTP::Response::Encoding; sub new_from_uri { my ($pkg, $uri, $ua, $referer) = @_; my $resolver = HTML::ResolveLink->new( base => $uri, ); return HTML::TreeBuilder::XPath->new_from_content( $resolver->resolve( $pkg->get($uri, $ua, $referer) ) ); } sub get { my ($self, $uri, $ua, $referer) = @_; my $html; $ua ||= LWP::UserAgent->new(); my $req = HTTP::Request->new('GET', $uri); $req->header(referer => $referer) if ($referer); my $res = $ua->request($req); # this detection is based on Web::Scraper. if ($res->is_success) { my @encoding = ( $res->encoding, ($res->header('Content-Type') =~ /charset=([\w\-]+)/g), "Detect", "shift-jis" ); my $encoding = first { defined $_ && Encode::find_encoding($_) } @encoding; $html = Encode::decode($encoding, $res->content); return $html; } return; }