Studying a Perl Crawler



I've been busy these past few days with project work and some W3A testing, so I haven't had much time to study anything else.

With the holiday coming up, I decided to give myself a bit of a break today. I read through the Perl crawler that 云总 wrote and found quite a few parts I didn't understand.

I did manage to understand parts of it, though, so it seems the goal is still a long way off.

I've added some comments to the code, but I'm too tired and heading to bed. Only part of it is annotated; I'll finish the rest another day.

Make do with it for now...

```perl
#!/usr/bin/perl
use strict;
use warnings;
use threads;
use threads::shared;
use Thread::Queue;
use Thread::Semaphore;
use Bloom::Filter;
use URI::URL;
use Web::Scraper;

# Number of worker threads
my $max_threads = 15;
# Crawl target
my $base_url = $ARGV[0] || 'http://www.icylife.net';
# (P1) I haven't figured this line out yet
my $host = URI::URL->new($base_url)->host;

# Create a thread-safe task queue
my $queue = Thread::Queue->new();
# Create a semaphore that caps the number of concurrent threads
my $semaphore = Thread::Semaphore->new($max_threads);
# A one-slot semaphore used as a mutex
my $mutex = Thread::Semaphore->new(1);
# Bloom filter, used to test whether a URL has been seen before
my $filter = shared_clone(
    Bloom::Filter->new( capacity => 1000, error_rate => 0.0001 )
);

# Put the target into the task queue
$queue->enqueue($base_url);
# Add it as the first member to compare against
$filter->add($base_url);

while (1) {
    # Join every thread that has finished
    foreach ( threads->list(threads::joinable) ) {
        $_->join();
    }

    # If there is no URL left to process...
    my $item = $queue->pending();
    if ( $item == 0 ) {
        my $active = threads->list(threads::running);
        # ...and no thread is still running, the job is finished
        if ( $active == 0 ) {
            print "All done!\n";
            last;
        }
        # ...but some threads are still active, they may yet enqueue
        # more URLs, so just wait for them
        else {
            sleep 1;
            next;
        }
    }

    # There are URLs to process: take a slot and spawn a worker
    $semaphore->down;
    threads->create( \&ProcessUrl );
}

# Join any threads that remain
foreach ( threads->list() ) {
    $_->join();
}

sub ProcessUrl {
    # Extract the href attribute of every <a> element on the page
    my $scraper = scraper {
        process '//a', 'links[]' => '@href';
    };
    my $res;
    my $link;

    while ( my $url = $queue->dequeue_nb() ) {
        # scrape() can die on a broken page; trap the error and move on
        eval { $res = $scraper->scrape( URI->new($url) )->{'links'}; };
        if ($@) {
            warn "$@\n";
            next;
        }
        next if ( !defined $res );

        foreach ( @{$res} ) {
            $link = $_->as_string;
            # Resolve relative links against the current page
            $link = URI::URL->new( $link, $url );
            # Not http and not https?
            next if ( $link->scheme ne 'http' && $link->scheme ne 'https' );
            # Another domain?
            next if ( $link->host ne $host );

            $link = $link->abs->as_string;
            # Strip the fragment, if any
            if ( $link =~ /(.*?)#(.*)/ ) {
                $link = $1;
            }
            # Skip static and binary resources
            next if ( $link =~ /\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|doc|js|css|docx|xls|xlsx)$/i );

            # Only one thread at a time may touch the filter and the queue
            $mutex->down();
            if ( !$filter->check($link) ) {
                print $filter->key_count(), " ", $link, "\n";
                $filter->add($link);
                $queue->enqueue($link);
            }
            $mutex->up();
            undef $link;
        }
        undef $res;
    }
    undef $scraper;
    # Give the concurrency slot back
    $semaphore->up();
}
```
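The crawl loop above boils down to a producer-consumer pattern: a shared queue of URLs, a capped pool of workers, and a mutex-guarded duplicate check before anything is re-enqueued. A minimal Python sketch of that same pattern, under stated assumptions: a plain `set` stands in for the Bloom filter, and a hypothetical in-memory `SITE` dict stands in for what `Web::Scraper` would extract from real HTML.

```python
import threading
import queue

# Hypothetical in-memory "site" (page -> links on it), standing in for
# real HTTP fetching and HTML scraping.
SITE = {
    "/": ["/a", "/b", "/a#frag"],
    "/a": ["/b", "/c", "/logo.jpg"],
    "/b": ["/"],
    "/c": [],
}

MAX_THREADS = 4
tasks = queue.Queue()          # plays the role of Thread::Queue
seen = set()                   # plays the role of the Bloom filter
seen_lock = threading.Lock()   # plays the role of the $mutex semaphore

def process_url():
    """Worker: take URLs, extract links, enqueue the unseen ones."""
    while True:
        url = tasks.get()
        try:
            for link in SITE.get(url, []):
                link = link.split("#", 1)[0]          # strip the #fragment
                if link.endswith((".jpg", ".png", ".zip")):
                    continue                          # skip static resources
                with seen_lock:                       # dedup check is a critical section
                    if link not in seen:
                        seen.add(link)
                        tasks.put(link)
        finally:
            tasks.task_done()

# Seed the crawl, start a capped worker pool, wait for the queue to drain.
seen.add("/")
tasks.put("/")
for _ in range(MAX_THREADS):
    threading.Thread(target=process_url, daemon=True).start()
tasks.join()
print(sorted(seen))  # → ['/', '/a', '/b', '/c']
```

Unlike the Perl version, which spawns a fresh thread per pending URL under a counting semaphore, this sketch starts a fixed pool up front and lets `Queue.join()` detect completion; the dedup-under-lock logic is the same.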
Original article: http://outofmemory.cn/langs/1278244.html
