'''
Reverse-IP lookup tool: lists the domains that resolve to a given IP address.
Example query URL: http://dns.aizhan.com/202.203.208.8/
'''
import math
import os
import re
import sys
import requests
from bs4 import BeautifulSoup
# Base URL of the reverse-IP lookup service; get_page() builds its request URLs from it.
HOST = 'http://dns.aizhan.com'
# Fetch the HTML of one result page
def get_page(ip, page=None, timeout=10):
    """Download one lookup result page for *ip* and return its HTML text.

    Args:
        ip: The IP address to look up; inserted into the URL path as-is.
        page: Optional page number. When falsy (None or 0) the first page,
            ``HOST/ip/``, is requested.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The response body decoded as text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    url = '{}/{}/'.format(HOST, ip)
    if page:
        # url already ends with '/', so append the page number directly
        # instead of joining with another '/' (which produced '.../ip//page').
        # NOTE(review): assumes the site expects a single slash - confirm.
        url = '{}{}'.format(url, page)
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.text
# Work out how many result pages exist for an IP
def get_pagenum(ip):
    """Fetch the first result page and derive the total number of pages.

    The first page is searched for the site's "N domains resolve to this IP"
    counter; the page count is that number divided by 20 and rounded up.

    Args:
        ip: The IP address to look up.

    Returns:
        A ``(pagenum, pages)`` tuple where *pagenum* is the number of result
        pages (0 when the counter is not found in the page) and *pages* is a
        one-element list holding the HTML of the first page, so callers do
        not have to download it again.
    """
    patt = re.compile(r'共有 <span class="red">(\d+)</span> 个域名解析到该IP')
    page_one = get_page(ip)
    match = patt.search(page_one)
    if not match:
        # Keep the return shape identical to the success path: callers unpack
        # two values, so a bare 0 here would raise TypeError.
        return 0, [page_one]
    count = int(match.group(1))
    # int() directly on the ceil result: round-tripping through str() fails
    # when ceil returns a float ('2.0' is not a valid int literal).
    pagenum = int(math.ceil(count / 20.0))
    return pagenum, [page_one]
# Collect the HTML of every result page for an IP
def get_domains(ip):
    """Return a list with the HTML text of each result page for *ip*.

    The page count and the already-downloaded first page come from
    get_pagenum(); pages 2..N are then fetched one by one with get_page()
    and added to that same list, which is returned.
    """
    last_page, collected = get_pagenum(ip)
    collected.extend(
        get_page(ip, page_no) for page_no in range(2, last_page + 1)
    )
    return collected
# Write the result file and return the parsed entries
def html_parse(filepath, pages):
    """Parse the downloaded pages and append an HTML report to *filepath*.

    Extraction of the domain/title pairs is not implemented yet (see the
    TODO below), so at present the report consists only of the opening and
    closing ``<html>`` tags and the returned list is empty.

    Args:
        filepath: Path of the report file. It is opened in append mode, so an
            existing file is extended rather than overwritten.
            NOTE(review): repeated runs therefore stack several
            ``<html>...</html>`` documents in one file - confirm that append
            mode is intended.
        pages: Iterable of HTML strings, one per result page.

    Returns:
        The list of parsed entries (currently always empty).
    """
    res_list = []
    # The context manager closes the file even if parsing a page raises.
    with open(filepath, "a") as f:
        f.write('<html>')
        for page in pages:
            soup = BeautifulSoup(page, 'html.parser')
            # TODO: clean the data in `soup` and extract each domain and its
            # title. For every entry, build {'domain': ..., 'title': ...},
            # write it to the report as
            #   '<a href={domain}>{domain}</a>\t\t{title}<br/>'
            # and append the dict to res_list.
        f.write('</html>')
    return res_list
# Command-line entry point: reverseIP <targetIP> <outfile.html>
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage:reverseIP targetIP Outfile")
    else:
        ip, outfile = sys.argv[1:3]
        # Test the actual '.html' suffix; comparing the text after the last
        # dot also accepted a bare file name such as 'html'.
        if not outfile.endswith('.html'):
            print("The outfile must end with '.html' ")
        else:
            print("The target IP is :%s" % ip)
            print("Starting, please wait...")
            pages = get_domains(ip)
            html_parse(outfile, pages)
            print("Success! The path of result file is %s" % outfile)
反查域名的ip规则已经改变了
Python提供了一个HTMLParser模块,可以非常简单的解析HTML
首先考虑如何从如下的HTML中提取信息
<!-- basic.html -->
<HTML>
<HEAD>
<TITLE>Doc Title &amp Intro</TITLE>
</HEAD>
<BODY>
This is my text.
</BODY>
</HTML>
首先定义一个TitleParser类,它是标准HTMLParser类的子类
HTMLParser的feed()方法会调用handle_starttag(), handle_data(), handle_endtag()方法
#! /usr/bin/env python
#coding=utf-8
from htmlentitydefs import entitydefs
from HTMLParser import HTMLParser
import sys
class TitleParser(HTMLParser):
    """HTML parser that collects the text inside ``<title>`` elements.

    Feed a document with ``feed()`` and read the result with ``gettitle()``.
    Text outside a ``<title>`` element is ignored.
    """

    def __init__(self):
        # The accumulator starts as a single space, so the collected title
        # always carries one leading space.
        self.title = ' '
        # True while the parser is between <title> and </title>.
        self.readingtitle = False
        # Explicit base-class call so this also works with the Python 2
        # HTMLParser module imported by this file.
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Start collecting text when a <title> tag opens."""
        if tag == 'title':
            self.readingtitle = True

    def handle_data(self, data):
        """Append *data* to the title while inside a <title> element."""
        if self.readingtitle:
            self.title += data

    def handle_endtag(self, tag):
        """Stop collecting text when the <title> tag closes."""
        if tag == 'title':
            self.readingtitle = False

    def handle_entityref(self, name):
        """Handle an entity reference such as ``&amp;``.

        Known entities are replaced by their value from ``entitydefs``;
        unknown ones are passed through literally as ``&name;``.
        """
        # `in` replaces dict.has_key(), which does not exist on Python 3.
        if name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            # Re-emit the reference with its terminating ';' restored.
            self.handle_data('&' + name + ';')

    def gettitle(self):
        """Return the title text collected so far."""
        return self.title
# Parse the HTML file named by the first command-line argument and print
# the title that TitleParser collects from it.
with open(sys.argv[1]) as fd:
    # Read inside the context manager so the file is closed right away.
    html_text = fd.read()
tp = TitleParser()
tp.feed(html_text)
# Single formatted argument: prints the same text under both the Python 2
# print statement and the Python 3 print function.
print("Title is: %s" % tp.gettitle())
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)