返回顶部

收藏

利用perl基础库抓取百度博客,生成wp导入文件

更多

只在ubuntu linux下测试过。

执行方式

linux下使用shell

$ perl baidu.pl 百度博客域名

# Author : thicket
# Date : 2013/01/31
# WebSite : hi.baidu.com
# 在当前文件夹生成以日期为文件名的xml文件,可以导入wordpress

use LWP::Simple;
use HTML::Parse;
use HTML::Element;
use URI::URL;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use URI::Escape;
use POSIX;

# ---------------------------------------------------------------------------
# Main script.
#
# Usage: perl baidu.pl <blog-name>
#
# Walks every list page of http://hi.baidu.com/<blog-name>, fetches each post
# linked from those pages, and writes a single WordPress import (WXR) file
# named ./baidu<Y-M-D>.xml into the current directory.
# ---------------------------------------------------------------------------

# $website stays a package variable: getLink() reads it to resolve post links.
our $website = $ARGV[0];

if (!$website) {
    print "=== add website ! ===\n";
    exit;
}

$website = 'http://hi.baidu.com/' . $website;

my ($mday, $mon, $year) = (localtime(time()))[3, 4, 5];
my $format_time = sprintf("%d-%d-%d", $year + 1900, $mon + 1, $mday);
my $file_name   = './baidu' . $format_time . '.xml';

my $ua = LWP::UserAgent->new;

# The first list page carries the total post count and the page size.
my $first_page = get("$website?page=1");
die "failed to fetch $website?page=1\n" unless defined $first_page;

my ($post_total, $page_size) = getPageNum($first_page);
die "could not find the page size in $website?page=1\n" unless $page_size;
my $page_count = ceil($post_total / $page_size);

open(my $out, '>', $file_name) or die "open file error: $file_name: $!\n";

# The document header is written exactly once, before any <item>.
# NOTE(review): wp:wxr_version is added because the WordPress importer is
# understood to reject files without it - confirm against the importer in use.
print {$out} <<'END_HEADER';
<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0"
    xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
    xmlns:content="http://purl.org/rss/1.0/modules/content/"
    xmlns:wfw="http://wellformedweb.org/CommentAPI/"
    xmlns:dc="http://purl.org/dc/elements/1.1/"
    xmlns:wp="http://wordpress.org/export/1.2/"
>
<channel>
    <wp:wxr_version>1.2</wp:wxr_version>
END_HEADER
# The source blog URL is recorded as the channel link so the file stays valid XML.
print {$out} "    <link>$website</link>\n";

for my $page (1 .. $page_count) {
    my $url_ind = "$website?page=$page";
    print $url_ind . "\n";

    my $list_response = $ua->request(HTTP::Request->new('GET', $url_ind));
    if (!$list_response->is_success) {
        print $list_response->error_as_HTML;
        next;
    }
    my $list_html = $list_response->content;

    # Collect the anchors whose href carries "#reply", then drop the "#reply"
    # fragment and any backslashes so only the plain post URL remains
    # (presumably the links are backslash-escaped in the page - verify).
    my @anchors = ($list_html =~ m/(<a[^>]+?#reply[^>]+?>)/gi);
    my $anchors = join('', @anchors);
    $anchors =~ s/\\|#reply//ig;

    for my $post_url (getLink($anchors)) {
        my $post_response = $ua->request(HTTP::Request->new('GET', $post_url));
        if (!$post_response->is_success) {
            print $post_response->error_as_HTML;
            next;
        }

        my ($date, $title, $tag, $content) = getContent($post_response->content);
        print '===============================';
        print $post_url . "\n";
        print $date . "\n" . $title . "\n" . $tag . "\n";

        print {$out} _build_item($title, $date, $content, split(' ', $tag));
    }
}

# The closing tags are written once, after the last page.
print {$out} "</channel></rss>\n";
close($out) or die "close file error: $file_name: $!\n";

# Wrap text in a CDATA section. A literal "]]>" inside the text would end the
# section early, so it is split across two adjacent sections.
sub _cdata {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/\]\]>/]]]]><![CDATA[>/g;
    return '<![CDATA[' . $text . ']]>';
}

# Build the <item> element for one post.
#
# Params : $title, $date, $content - fields as returned by getContent();
#          @tags - the post's tag names.
# Returns: the XML text of the item, ending in a newline.
# NOTE(review): pubDate, post_date_gmt and dc:creator are fixed placeholder
# values, and $title is inserted exactly as extracted from the page.
sub _build_item {
    my ($title, $date, $content, @tags) = @_;

    my $post_name = uri_escape($title);
    my $body      = _cdata($content);

    my $item = <<"END_ITEM";
<item>
    <title>$title</title>
    <link></link>
    <pubDate>Tue, 15 Jan 2013 12:53:41 +0000</pubDate>
    <dc:creator>thicket</dc:creator>
    <guid isPermaLink="false"></guid>
    <description></description>
    <content:encoded>$body</content:encoded>
    <excerpt:encoded><![CDATA[]]></excerpt:encoded>
    <wp:post_id></wp:post_id>
    <wp:post_date>$date</wp:post_date>
    <wp:post_date_gmt>2013-01-15 12:53:41</wp:post_date_gmt>
    <wp:comment_status>open</wp:comment_status>
    <wp:ping_status>open</wp:ping_status>
    <wp:post_name>$post_name</wp:post_name>
    <wp:status>publish</wp:status>
    <wp:post_parent>0</wp:post_parent>
    <wp:menu_order>0</wp:menu_order>
    <wp:post_type>post</wp:post_type>
    <wp:post_password></wp:post_password>
    <wp:is_sticky>0</wp:is_sticky>
END_ITEM

    # The nicename attribute value is quoted so the element is well-formed XML.
    for my $tag_name (@tags) {
        $item .= '    <category domain="post_tag" nicename="'
               . uri_escape($tag_name) . '">'
               . _cdata($tag_name) . "</category>\n";
    }

    $item .= <<'END_ITEM_TAIL';
    <wp:postmeta>
        <wp:meta_key>_edit_last</wp:meta_key>
        <wp:meta_value><![CDATA[1]]></wp:meta_value>
    </wp:postmeta>
</item>
END_ITEM_TAIL

    return $item;
}

##########################################################################################

# Collect the article links found in an HTML fragment.
#
# Params : $_[0] - HTML text containing <a> elements.
# Reads  : the package variable $website, used as the base URL when turning
#          each href into an absolute URL.
# Returns: a list of absolute URI::URL objects, one per link found.
sub getLink {
    my ($html) = @_;
    return () unless defined $html && length $html;

    my $tree = HTML::Parse::parse_html($html);

    my @full_url;
    for my $found (@{ $tree->extract_links("a") }) {
        # Each entry is an array ref whose first element is the link text.
        my $link = $found->[0];
        push(@full_url, URI::URL->new($link)->abs($website));
    }

    # Release the parse tree explicitly; this sub runs once per list page.
    $tree->delete;

    return @full_url;
}

# Pull the date, title, tags and body out of a single post page.
#
# Params : $_[0] - HTML text of one post page.
# Returns: the list ($date, $title, $tag, $content):
#            $date    - contents of the "content-other-info" div, markup removed
#            $title   - inner HTML of <h2 class="title content-title">
#            $tag     - the text after "#" in every <a class="tag">, joined by spaces
#            $content - inner HTML of <div id=content ...>, up to the first </div>
#          A field that is not found is returned as the empty string.
sub getContent {
    # Work on a lexical copy: assigning to the global $_ would overwrite
    # whatever the caller currently has aliased to it (e.g. a foreach element).
    my ($html) = @_;
    $html = '' unless defined $html;

    # /s lets "." cross newlines, so fields spread over several lines match.
    my $date = join('', $html =~ m{<div[^>]+class=content-other-info>\s*(.+?)\s*</div>}is);
    $date =~ s/<[^>]*>//g;

    my $title = join('', $html =~ m{<h2 class="title content-title">(.+?)</h2>}is);

    my $content = join('', $html =~ m{<div id=content[^>]+>(.+?)</div>}is);

    my $tag = join(' ', $html =~ m{<a class="tag"[^>]+>#(.+?)</a>}gis);
    $tag =~ s/<[^>]*>//g;

    return ($date, $title, $tag, $content);
}

# Read the total post count and the posts-per-page value from a list page.
#
# Params : $_[0] - text of a blog list page.
# Returns: the list ($pages_totle, $pages_row): the first number that follows
#          "allCount" and the first number that follows "pageSize", each
#          matched case-insensitively on the same line as its key.
#          A value that is not found is returned as the empty string.
sub getPageNum {
    my ($html) = @_;
    $html = '' unless defined $html;

    # Capture only the first digit run after each key, so other numbers on
    # the same line cannot leak into the result.
    my $pages_totle = join('', $html =~ m/allCount[^0-9\n]*([0-9]+)/i);
    my $pages_row   = join('', $html =~ m/pageSize[^0-9\n]*([0-9]+)/i);

    return ($pages_totle, $pages_row);
}
#该片段来自于http://outofmemory.cn

标签:perl,网络

收藏

0人收藏

支持

0

反对

0

相关聚客文章
  1. 老熊 发表 2012-04-24 16:27:58 为11gR2 Grid Infrastructure增加新的public网络
  2. admin 发表 2013-04-06 12:08:53 判断当前网络是否存在
  3. 发表 2013-05-02 16:26:51 折腾二级域名RSS
  4. 四火 发表 2013-05-26 14:19:42 网络爬虫
  5. P迪 发表 2013-06-27 00:39:50 移动改变全球用户网络行为 平板电脑地位凸显
  6. NinGoo 发表 2012-03-06 14:19:38 用perl清理被注入代码的PHP文件
  7. tst 发表 2013-05-23 07:56:39 谈渗透测试方法和流程
  8. 刘兴亮 发表 2013-08-20 06:10:53 社交网络综合症,你有吗?征集症状
  9. 郑永 发表 2013-09-20 00:34:38 手机搭建php+mysql完美运行wordpress
  10. 发表 2013-10-20 08:35:28 Windows 8.1升级注意事项
  11. Ye, Lu 发表 2013-11-07 09:46:45 UnitedStack与Juniper网络合作加速SDN在OpenStack中的整合
  12. 博主 发表 2010-03-29 16:00:00 perl边学边练(purge脚本)

发表评论