Feature overview:
Given the URL of a Tianya forum (天涯论坛) post, fetch the content of the entire thread, starting from the given page.
Optionally, replies from posters other than the thread starter (楼主) can be included.
1. First, take a look at the part of a Tianya post's document structure that we care about:
<body>
  <div>
    <h1><span>xxx</span> <span>title</span></h1>
    <div>
      <div>click count / reply count</div>
      <!-- the following tag may also be absent -->
      <div id="pagedivtop">
        <form>
          <input type="hidden" name="idArticleslist" value="IDs of all the thread's pages">
        </form>
      </div>
      <!-- original poster (楼主) info -->
      <table cellspacing="0" cellpadding="0" id="firstAuthor"><td><a>author</a> post date</td></table>
      <div id="pContentdiv">
        <!-- original poster's post -->
        <div _app="eleshare">
        <!-- replier info -->
        <table bgcolor="#f5f9fa" width="100%"><font size="-1" color="green"><a>author</a> post date</font></table>
        <!-- replier's post -->
        <div _app="eleshare"></div>
      </div>
</body>
2. The script is implemented with the HTML::TreeBuilder, LWP::Simple, HTML::Element and Encode modules (reference: CPAN).
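As a warm-up, here is a minimal sketch, not taken from the original script, of how these modules fit together against the structure in step 1: fetch the raw page with LWP::Simple, decode it (Tianya pages of that era were served as GBK, an assumption worth checking), build a tree with HTML::TreeBuilder, and use HTML::Element's look_down to pull out the title and the hidden idArticleslist value. The element and attribute names come from the sketch above and may need adjusting against a real page.

use strict;
use warnings;
use LWP::Simple;
use HTML::TreeBuilder;
use Encode;

my $url  = shift or die "usage: $0 <tianya post url>\n";
my $html = get($url) or die "Couldn't fetch $url\n";

# Tianya pages of that era were served as GBK; decode before parsing
# (an assumption; check the page's charset if the output looks wrong).
my $tree = HTML::TreeBuilder->new_from_content(decode('gbk', $html));

# Title: per the structure in step 1, the second <span> inside the <h1>.
my ($h1) = $tree->look_down(_tag => 'h1');
if ($h1) {
    my @spans = $h1->look_down(_tag => 'span');
    print encode('gbk', $spans[1]->as_text()), "\n" if @spans >= 2;
}

# Page list: the hidden <input name="idArticleslist"> holds a comma-separated
# list of page IDs (the attribute name is taken from the sketch in step 1).
my ($input) = $tree->look_down(_tag => 'input', name => 'idArticleslist');
if ($input) {
    my @ids = split /,/, $input->attr('value');
    print scalar(@ids), " pages in this thread\n";
}

$tree->delete();   # HTML::Element trees are self-referential, so free them explicitly

crawl.pl below follows the same pattern, just without the decode step.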
3. Everything is in a single file, crawl.pl. Usage examples:
perl crawl.pl http://www.tianya.cn/publicforum/content/funinfo/1/2854941.shtml > ylbg.txt
perl crawl.pl http://www.tianya.cn/publicforum/content/no05/1/204253.shtml > yslt.txt
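One more usage note, inferred from the $other flag in the script below: passing any extra argument after the URL (the word itself does not matter) makes the crawler keep only the thread starter's posts and skip everyone else's replies, for example:

perl crawl.pl http://www.tianya.cn/publicforum/content/no05/1/204253.shtml onlyop > yslt_op.txt

Here "onlyop" and "yslt_op.txt" are arbitrary placeholders.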
4. The crawl.pl source code. Heh, it's written rather clumsily, please bear with me:
[Note: in the quoted code, CSDN's editor replaced "$$" with "$"; the CSDN editor is terrible and can't even handle Perl.]
use strict;
use warnings;
use HTML::TreeBuilder;
use LWP::Simple;
use Encode;
no utf8;   # treat the source text as literal bytes in the current lexical scope

# Whether to print replies from people other than the original poster:
# 1 = print them, 0 = skip them (any extra command-line argument turns this off).
my $other = 1;
$other = 0 if (@ARGV > 1);

my ($url) = @ARGV;
if (@ARGV < 1) {
    print "input a URL\n";
    exit;
}
# e.g. http://www.baidu.com
if ($url !~ /^http:/) {
    $url = "http://$url";
}

my (
    $pagewrap,
    $title,
    $author,          # the thread starter
    $clicks,          # click count
    $totalPage,       # total number of pages
    @idArticleslist,  # IDs of every page of the thread
    $root             # HTML document root element
);

&PrintPage($url);

# Fetch a URL and build the HTML tree into the global $root.
sub InitRootfromURl {
    my ($url) = @_;
    my $content = get($url);
    # print "Get HTML document\n";
    die "Couldn't get the URL $url!" unless defined $content;
    $root = HTML::TreeBuilder->new_from_content($content);
}

# Walk every page of the thread, starting from the page given on the command line.
sub PrintPage {
    # e.g. http://www.tianya.cn/publicforum/content/feeling/1/1323274.shtml
    # urlPattern is:
    #   http://www.tianya.cn/publicforum/content/feeling/1
    # postfix is:
    #   shtml
    my ($link) = @_;
    my (
        $urlPattern,  # link pattern
        $postfix,     # the link postfix
        $currId       # the input URL's page ID
    );
    my @tmp = split(/\//, $link);
    $urlPattern = join('/', @tmp[0 .. $#tmp - 1]);
    ($currId, $postfix) = split(/\./, $tmp[-1]);
    @tmp = ();

    &InitRootfromURl($link);
    &InitFromfirstlink($root);
    return unless ($idArticleslist[0]);

    # The title, author, click count and page list only appear on the first page,
    # so fetch it if the user gave us a later page.
    if ($currId ne $idArticleslist[0]) {
        $link = "$urlPattern/$idArticleslist[0].$postfix";
        &InitRootfromURl($link);
        &InitFromfirstlink($root);
        $root->delete();   # done with the first page's tree
    }

    # The information below was collected from the first page.
    print "Title:";    &toGBK($title)  if ($title);
    print "\nauthor:"; &toGBK($author) if ($author);
    print "\nclicks:"; &toGBK($clicks) if ($clicks);
    if ($totalPage) { print "\n"; &toGBK($totalPage); print "\n"; }

    # Print the content of all the posts.
    print "正文:\n";   # "正文" = body text
    my ($i, $start) = (0, 0);
    foreach my $id (@idArticleslist) {
        $i++;
        $start = 1 if (!$start && ($id eq $currId));
        if ($start) {
            print "\nPage $i\n";
            $link = "$urlPattern/$id.$postfix";
            &InitRootfromURl($link);
            &ParseAllPost($root);
            $root->delete();   # destroy the document tree to free memory
        }
    }
}

# Print a string re-encoded as GBK (for a Chinese Windows console).
sub toGBK {
    my ($content) = @_;
    $content =~ s/牋牋/ /g;   # strip the padding characters that show up in Tianya post text
    # print $content; return;  # if your OS is English, use this line instead of the GBK conversion below
    my @chars = split(//, $content);
    foreach my $char (@chars) {
        print encode("gbk", $char);
    }
}

# Print one text node of a post.
sub printContent {
    my ($content) = @_;
    $content =~ s/\s+$//g;
    $content =~ s/HTML::Element=.+$//;   # drop stringified element refs, keep only real text
    &toGBK($content);
}

# Print a replier's name inside a simple dashed box.
sub printReplier {
    my ($writer) = @_;
    return if ($writer =~ /^\s*$/);
    $writer =~ s/\s+//;
    my $dil = "-" x length(Encode::encode_utf8($writer));
    print "\n$dil\n|";
    &toGBK($writer);
    print "|\n$dil\n";
}

# =====
# Print the content of one post, recursing into nested containers.
sub ParsePost {
    my ($node) = @_;
    foreach my $n ($node->content_list()) {
        if ($n =~ /^HTML::Element=/) {   # child is an element
            next if (!(my $tag = $n->tag()));
            # print "$tag\n";
            if ($tag eq 'br') { print "\n"; next; }
            next if (($tag eq 'div') && ($n->attr('class')) && ($n->attr('class') eq 'post-jb'));
            next if ($tag eq 'img');
            next if ($tag eq 'span');
            next if ($tag eq 'u');
            &ParsePost($n);
        } else {                         # child is a plain text node
            &printContent($n);
        }
    }
}

# =====
# Parse and print every post on one page.
sub ParseAllPost {
    my ($node) = @_;
    my ($tag, $attr, @poster, $writer, $isShow);
    return unless ($node);
    my @a = $node->look_down(_tag => 'div', class => 'allpost', id => 'pContentdiv');
    return unless ($a[0]);
    $isShow = 1;
    foreach my $n ($a[0]->content_list()) {
        next if ($n !~ /^HTML::Element=/);
        next if (!($tag = $n->tag()));
        # a div.post holds the text of the post announced by the preceding table
        if ($isShow && ($tag eq 'div')) {
            next if (!($attr = $n->attr('class')));
            next if ($attr ne 'post');
            &ParsePost($n);
        }
        # a table announces the writer of the following post
        if ($tag eq 'table') {
            @poster = $n->look_down(_tag => 'a', target => '_blank');
            next unless ($poster[0]);
            $writer = $poster[0]->as_text();
            if (!$other) {
                # only show posts written by the original poster
                if ($writer eq $author) { $isShow = 1; }
                else                    { $isShow = 0; }
            }
            if ($isShow) {
                &printReplier($writer);
            }
        }
    }
    # $node->delete();   # delete all elements
}

# =====
# Initialize the author info and the page-link info:
# parse the 'pagewrap' div and find the title, the author, and all page IDs.
sub InitFromfirstlink {
    my ($node) = @_;
    my ($tag, $attr, @tmp, @a, $b);
    return unless ($node);
    # find the 'pagewrap' div
    @tmp = $node->look_down(_tag => 'div', class => 'pagewrap');
    return unless ($tmp[0]);
    foreach my $n ($tmp[0]->content_list()) {
        next if ($n !~ /^HTML::Element=/);
        next if (!($tag = $n->tag()));
        if ($tag eq 'h1') {
            # the title is the second <span> inside the <h1>
            @tmp = $n->look_down(_tag => 'span');
            next unless ($tmp[1]);
            $title = $tmp[1]->as_text();
            @tmp = ();
            next;
        }
        if ($tag eq 'table') {
            # the 'firstAuthor' table holds the thread starter's name
            @tmp = $n->look_down(_tag => 'a');
            next unless ($tmp[0]);
            $author = $tmp[0]->as_text();
            @tmp = ();
            next;
        }
        next if (!($attr = $n->attr('class')));
        if (($tag eq 'div') && ($attr eq 'function')) {
            @tmp = $n->content_list();
            # total page count
            @a = $n->look_down(_tag => 'span');
            if (@a > 0) { $totalPage = $a[0]->as_text(); }
            @a = ();
            # click count
            if (ref($tmp[0]) && ($tmp[0]->attr('class') || '') eq 'info') {
                @a = split(/ /, $tmp[0]->as_text());
                $clicks = $a[1];
                @a = ();
            } else {
                # print "error:", $tmp[0]->dump();
                next;
            }
            # the hidden input 'idArticleslist' holds the IDs of every page
            @a = $n->look_down(_tag => 'input', name => 'idArticleslist');
            if (@a > 0) {
                $b = $a[0]->attr('value');
                @idArticleslist = split(/,/, $b);
            }
            @a = ();
            # print $tmp[1]->dump();
            @tmp = ();
            next;
        }
        if (($tag eq 'div') && ($attr eq 'allpost')) {
            # $node->detach;
            last;
        }
    }
}
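The example URLs above are from an old version of Tianya and may no longer resolve, so it can be handy to test the parsing against a locally saved copy of a page instead of fetching it live. A small sketch, assuming a page saved as page.shtml (the file name is just an example):

use strict;
use warnings;
use HTML::TreeBuilder;

# Build the tree from a locally saved page instead of a live URL.
my $tree = HTML::TreeBuilder->new_from_file('page.shtml');

# Reuse the same lookup as crawl.pl to check whether the structure still matches.
my ($posts) = $tree->look_down(_tag => 'div', class => 'allpost', id => 'pContentdiv');
print $posts ? "found the post container\n" : "page structure has changed\n";

$tree->delete();   # HTML::Element trees are self-referential, so free them explicitly

The explicit delete() calls in crawl.pl serve the same purpose: each page's tree is thrown away before the next one is fetched, which keeps memory use flat over long threads.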