引言
写这个小爬虫主要是为了爬校园论坛上的实习信息,主要采用了Requests库
源码
URLs.py
主要功能是根据一个初始url(包含page页面参数)来获得page页面从当前页面数到pageNum的url列表
import re


def getURLs(url, attr, pageNum=1):
    """Build the list of page URLs from the current page up to pageNum.

    Given a url containing a page-number query parameter (e.g.
    ``...&page=3``), return one url per page from the current page
    number through ``pageNum`` inclusive, each produced by substituting
    the page number into the original url.

    :param url: start url containing ``<attr>=<number>``
    :param attr: name of the page-number parameter (e.g. ``"page"``)
    :param pageNum: last page number to generate (inclusive)
    :return: list of urls, or None if ``attr`` is not a string
    """
    all_links = []
    try:
        match = re.search(attr + r'=(\d+)', url)
        if match is None:
            # url does not carry the expected page parameter at all
            print("arguments error: url does not contain '%s=<number>'" % attr)
            return all_links
        now_page_number = int(match.group(1))
        for i in range(now_page_number, pageNum + 1):
            # Fix: the original called re.sub(pattern, repl, re.S),
            # omitting the target string `url` and passing the re.S flag
            # where the string belongs, so it always raised TypeError.
            new_url = re.sub(attr + r'=\d+', '%s=%d' % (attr, i), url)
            all_links.append(new_url)
        return all_links
    except TypeError:
        # attr was not a string, so the pattern concatenation failed
        print("arguments TypeError:attr should be string.")
uni_2_native.py
由于论坛上爬取得到的网页上的中文都是 unicode 数字字符引用的形式,文本格式都为 &amp;#XXXX; 的形式,所以在爬得网站内容后还需要对其进行转换
import re


def get_native(raw):
    """Decode HTML numeric character references in *raw* and return the result.

    The forum pages deliver Chinese text as ``&#NNNN;`` entities (see the
    surrounding article), so the scraped HTML must be decoded before use.

    :param raw: scraped text possibly containing ``&#NNNN;`` references
    :return: the text with every ``&#NNNN;`` replaced by its character
    """
    # Single pass with a replacement callback. The original looped
    # search/sub, re-used the raw matched text as a regex pattern (broken
    # whenever the match contained regex metacharacters) and rebuilt the
    # whole string once per entity. The Py2-only
    # reload(sys)/sys.setdefaultencoding('utf-8') hack is dropped: it
    # masks encoding bugs instead of fixing them.
    return re.sub(r'&#(\d+);', lambda m: chr(int(m.group(1))), raw)
存入sqlite数据库:saveInfo.py
# -*- coding: utf-8 -*-


class savesqlite():
    """Collect scraped post records in memory and persist them to MySQL.

    NOTE(review): despite the name, this class writes to MySQL via the
    MySQLdb driver, not sqlite — confirm which backend was intended.
    """

    def __init__(self):
        # Each entry is a dict with keys: author/Title/date/url/reply/vIEw.
        self.infoList = []

    def saveSingle(self, author=None, Title=None, date=None, url=None,
                   reply=0, vIEw=0):
        """Append one post record; all of author/Title/date/url are required.

        Records with any required field missing are dropped with a message.
        """
        if author is None or Title is None or date is None or url is None:
            print("No info saved!")
            return
        self.infoList.append({
            'author': author,
            'Title': Title,
            'date': date,
            'url': url,
            'reply': reply,
            'vIEw': vIEw,
        })

    def toMysqL(self):
        """Replace the contents of table ``info`` with the collected records."""
        # Local import so the in-memory part of the class works without
        # the MySQLdb driver installed.
        import MySQLdb
        conn = MySQLdb.connect(host='localhost', user='root', passwd='',
                               port=3306, db='db_name', charset='utf8')
        try:
            cursor = conn.cursor()
            # Wipe previous crawl results before inserting the new batch.
            cursor.execute("delete from info")
            conn.commit()
            # Fix: the original column list (5 names) and placeholder list
            # (3 %s) did not match the 6-tuples passed to executemany, so
            # the insert always failed. Column order now mirrors the
            # parameter tuples below.
            sql = ("insert into info(Title,author,url,date,reply,vIEw) "
                   "values (%s,%s,%s,%s,%s,%s)")
            params = [(each['Title'], each['author'], each['url'],
                       each['date'], each['reply'], each['vIEw'])
                      for each in self.infoList]
            cursor.executemany(sql, params)
            conn.commit()
            cursor.close()
        finally:
            # Ensure the connection is released even if an insert fails.
            conn.close()

    def show(self):
        """Print every collected record to stdout, one field per line."""
        for each in self.infoList:
            print("author: " + each['author'])
            print("Title: " + each['Title'])
            print("date: " + each['date'])
            print("url: " + each['url'])
            print("reply: " + str(each['reply']))
            print("vIEw: " + str(each['vIEw']))
            print('\n')


if __name__ == '__main__':
    save = savesqlite()
    save.saveSingle('网', 'aaa', '2008-10-10 10:10:10', 'www.baIDu.com', 1, 1)
    # save.show()
    save.toMysqL()
主要爬虫代码
import requests
from lxml import etree
from cc98 import uni_2_native, URLs, saveInfo

# Forge a request header for the target forum; the field values were
# blanked out for publication and must be filled in per-site.
headers = {
    'Accept': '',
    'Accept-EnCoding': '',
    'Accept-Language': '',
    'Connection': '',
    'cookie': '',
    'Host': '',
    'Referer': '',
    'upgrade-insecure-requests': '',
    'User-Agent': ''
}

# Start url (page=1) of the board listing; cc98 is the site root used to
# turn relative post hrefs into absolute links.
url = 'http://www.cc98.org/List.asp?boardID=459&page=1&action='
cc98 = 'http://www.cc98.org/'

print "get infomation from cc98..."
# Expand the start url into the urls of pages 1..50.
urls = URLs.getURLs(url, "page", 50)
savetools = saveInfo.savesqlite()

for url in urls:
    r = requests.get(url, headers=headers)
    # Forum text arrives as numeric character references; decode first.
    HTML = uni_2_native.get_native(r.text)
    selector = etree.HTML(HTML)
    # NOTE(review): the XPath predicate '[@]' looks truncated by the blog
    # extraction — confirm the original attribute filter on <table>.
    content_tr_List = selector.xpath('//form/table[@]/tbody/tr')
    for each in content_tr_List:
        href = each.xpath('./td[2]/a/@href')
        if len(href) == 0:
            # Row without a post link (header/separator row); skip it.
            continue
        else:
            # print len(href)
            # not very well using for, though just one element in List
            # but I don't kNow why I cannot get the data by index
            for each_href in href:
                # Build the absolute url of the post.
                link = cc98 + each_href
            # The @Title attribute packs title/author/date on three lines.
            Title_author_time = each.xpath('./td[2]/a/@Title')
            # print len(Title_author_time)
            for info in Title_author_time:
                info_split = info.split('\n')
                # Strip one wrapping character from the title line and the
                # three-character labels from the author/date lines.
                Title = info_split[0][1:len(info_split[0]) - 1]
                author = info_split[1][3:]
                date = info_split[2][3:]
            # 4th cell holds "replies/views"; split on '/'.
            hot = each.xpath('./td[4]/text()')
            # print len(hot)
            for hot_num in hot:
                reply_vIEw = hot_num.strip().split('/')
                reply, vIEw = reply_vIEw[0], reply_vIEw[1]
            # Buffer the record; it is written to the database at the end.
            savetools.saveSingle(author=author, Title=Title, date=date,
                                 url=link, reply=reply, vIEw=vIEw)

print "All got! Now saving to Database..."
# savetools.show()
savetools.toMysqL()
print "ALL CLEAR! Have Fun!"
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持编程小技巧。
以上是内存溢出为你收集整理的python定向爬虫校园论坛帖子信息全部内容,希望文章能够帮你解决python定向爬虫校园论坛帖子信息所遇到的程序开发问题。
如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)