I found this code online and thought it was quite good, so I decided to pull it apart and analyze it.
Reading other people's code is an alternative way of learning: working through it deepens your own understanding.
That goes especially for modules you have never used yourself; a real example shows you how they are actually called.
Of course, you can always go study the official documentation instead, but for me the fastest route is reading code examples other people have written.
Maybe everyone is a little different in that respect.
Here is the code: a small site-indexing spider that checks robots.txt, crawls every local HTML page, and writes each page's URL, title, and meta description to indexed.txt.

#!/usr/bin/perl
# siteindexingbot.pl

use warnings;
use strict;
use LWP::Simple;
use LWP::RobotUA;
use WWW::RobotRules;
use HTML::Parse;
use HTML::HeadParser;
use URI::URL;

my ($response, $tree, $link, %scanned);
# the arrays and hashes used to store page data
my (@pages, %titles, %keywords);

my $url = $ARGV[0] or die "Usage: siteindexingbot [url]\n";
my $base_url   = &globalize_url('/', $url);
my $robots_txt = $base_url . 'robots.txt';   # $base_url already ends in '/'
my $robot_rules =
    WWW::RobotRules->new("indexifier/1.0 (libwww-perl-$LWP::VERSION)");

# look for and parse the robots.txt file
if (head($robots_txt)) {
    print "robots.txt file found OK.\n";
    $robot_rules->parse($robots_txt, get($robots_txt));
} else {
    print "robots.txt file not found.\n";
}

# build the robot user agent; LWP::RobotUA takes the agent name, a
# contact address, and the rules object, and throttles requests for us
my $ua = LWP::RobotUA->new(
    "indexifier/1.0 (libwww-perl-$LWP::VERSION)",
    'me@here.com',
    $robot_rules
);
#$ua->proxy('http' => 'http://proxy.mylan.com/');
$ua->timeout(30);
$ua->max_size(1024 * 100);   # never download more than 100 KB per page
$ua->parse_head(1);          # let LWP copy <head> data into response headers

&scan($base_url);

# dump what we collected, one tab-separated line per page
open(FILE, '>', 'indexed.txt') or die "opening indexed.txt: $!";
foreach my $page (@pages) {
    print FILE join("\t", $page, $titles{$page}, $keywords{$page}), "\n";
}
close(FILE);
exit;

# recursively scan a page: record its data, then follow its local links
sub scan {
    my $url = shift;
    print "Scanning '$url':\n";
    return if $scanned{$url};
    &get_info($url);    # this is the extra subroutine
    $scanned{$url} = 1;
    my @links = &get_links($url);
    foreach $link (@links) {
        if ($robot_rules->allowed($link)) {
            if ($link =~ /^$base_url/i) {
                my $request  = HTTP::Request->new('HEAD' => $link);
                my $response = $ua->request($request);
                my $content_type = $response->header('Content-Type');
                if ($response->is_error) {
                    print "Dead link to $link found on $url\n";
                } else {
                    print "$url links to $link\n";
                    # the header may carry a charset, e.g. "text/html; charset=utf-8"
                    if ($content_type =~ m{^text/html}i) {
                        &scan($link);
                    } else {
                        print "$link is not HTML\n";
                    }
                }
            } else {
                print "$link is not local to $base_url\n";
            }
        } else {
            print "Access to $link is not allowed by robots.txt\n";
        }
    }
    return;
}

# turn a (possibly relative) link into an absolute URL, minus any fragment
sub globalize_url {
    my ($link, $referring_url) = @_;
    my $url_obj = URI::URL->new($link, $referring_url);
    my $absolute_url = $url_obj->abs->as_string;
    $absolute_url =~ s/#.*$//;    # strip the #fragment part, keep the rest
    return $absolute_url;
}

# fetch a page and return all the absolute URLs it links to
sub get_links {
    my $url = shift;
    my $request = HTTP::Request->new('GET' => $url);
    $request->header('Accept' => 'text/html');
    my $response = $ua->request($request);
    my $tree = HTML::Parse::parse_html($response->content);
    my $links_ref = $tree->extract_links('a', 'frame', 'iframe');
    my @links;
    foreach $link (@$links_ref) {
        push(@links, &globalize_url(${$link}[0], $url));
    }
    return @links;
}

# record a page's URL, title, and meta description
sub get_info {
    my $url = shift;
    my $request = HTTP::Request->new('GET' => $url);
    $request->header('Accept' => 'text/html');
    my $response = $ua->request($request);
    my $parser = HTML::HeadParser->new;
    $parser->parse($response->content);
    my $title = $parser->header('Title') || 'Untitled document';
    # parse_head(1) above makes LWP expose <meta name="description">
    # as the X-Meta-description response header
    my $keywords = $response->header('X-Meta-description') || 'none';
    push(@pages, $url);
    $titles{$url}   = $title;
    $keywords{$url} = $keywords;
    return;
}
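To run it, pass the starting URL on the command line (the hostname below is just a placeholder):

perl siteindexingbot.pl http://www.example.com/

As it crawls, the script prints progress messages, and when it finishes you get one tab-separated line per page (URL, title, description) in indexed.txt.

HTML::HeadParser was one of those modules I had never used, so here is a minimal standalone sketch of just that piece; the sample HTML string is made up for illustration:

#!/usr/bin/perl
use warnings;
use strict;
use HTML::HeadParser;

# a made-up page head to feed the parser (illustration only)
my $html = '<html><head>'
         . '<title>Example Page</title>'
         . '<meta name="description" content="A short demo page">'
         . '</head><body></body></html>';

my $parser = HTML::HeadParser->new;
$parser->parse($html);

# the parser exposes what it found as pseudo-headers:
# 'Title' for <title>, 'X-Meta-NAME' for each <meta name="NAME"> tag
print 'Title: ', $parser->header('Title'), "\n";
print 'Description: ', $parser->header('X-Meta-description'), "\n";

Running this prints the title and description pulled out of the <head> block, which is exactly what get_info relies on in the spider above.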