下面是内存溢出 jb51.cc 通过网络收集整理的代码片段。
内存溢出小编现在分享给大家,也给大家做个参考。
#!/usr/bin/env python
# coding=utf-8
"""Download the articles of one Duzhe magazine issue from www.52duzhe.com.

Author: Anemone
Filename: getmain.py
Last modified: 2015-02-19 16:47
E-mail: [email protected]
"""
import re
import sys
from contextlib import closing

try:
    import urllib2  # Python 2
except ImportError:
    import urllib.request as urllib2  # Python 3 exposes the same urlopen()

from bs4 import BeautifulSoup

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str <-> unicode conversions use UTF-8,
    # as the original script did. Neither call exists on Python 3.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

# Seconds to wait on a single HTTP request before giving up on it.
REQUEST_TIMEOUT = 30
# How many times one article page is tried before it is skipped.
MAX_FETCH_ATTEMPTS = 3


def _fetch_soup(url):
    """Download *url* and return its content parsed by BeautifulSoup."""
    # closing() is used because Python 2 responses are not context managers.
    with closing(urllib2.urlopen(url, timeout=REQUEST_TIMEOUT)) as response:
        html = response.read()
    return BeautifulSoup(html)


def getEachArticle(url):
    """Fetch a single article page and extract its fields.

    Args:
        url: Absolute URL of the article page.

    Returns:
        A dict with the keys:
        "Title"   - string of the first <h1> element;
        "writer"  - stripped string of the element with id "pub_date";
        "from"    - stripped string of the element with id "media_name";
        "context" - the page text that follows the first match of the
                    pattern "BAIDU_CLB.*;" (up to the next match, if any).

    Raises:
        AttributeError: if one of the expected elements is missing or has
            no single string child.
        IndexError: if the "BAIDU_CLB...;" marker is not found in the page.
        Any error raised by ``urlopen``/``read`` for network failures.
    """
    soup = _fetch_soup(url)
    title = soup.find("h1").string
    # NOTE(review): the "writer" value is read from the element whose id is
    # "pub_date" -- presumably that is where the site puts the author; confirm.
    writer = soup.find(id="pub_date").string.strip()
    source = soup.find(id="media_name").string.strip()
    # Everything before the marker is page chrome; keep what follows it.
    parts = re.split("BAIDU_CLB.*;", soup.get_text())
    return {
        "Title": title,
        "writer": writer,
        "from": source,
        "context": parts[1],
    }


def _fetch_article_with_retry(url):
    """Return the article at *url*, or None after MAX_FETCH_ATTEMPTS failures."""
    for attempt in range(1, MAX_FETCH_ATTEMPTS + 1):
        try:
            return getEachArticle(url)
        except Exception as err:
            # Deliberately broad: the original retried on any failure. Unlike
            # the original bare ``except`` in an endless loop, this is bounded
            # and lets KeyboardInterrupt / SystemExit propagate.
            sys.stderr.write(
                "attempt %d/%d failed for %s: %r\n"
                % (attempt, MAX_FETCH_ATTEMPTS, url, err)
            )
    sys.stderr.write("giving up on %s\n" % url)
    return None


def getCatalog(issue):
    """Fetch the articles of one issue, grouped by column heading.

    Args:
        issue: Issue identifier such as "201501"; the first four characters
            and the last two characters select the issue directory
            ("2015_01/").

    Returns:
        A dict mapping each <h2> heading string of the catalog page to the
        list of article dicts (see ``getEachArticle``) linked from that
        heading's parent element. Articles that still fail after
        MAX_FETCH_ATTEMPTS tries are reported on stderr and left out.
    """
    url = "http://www.52duzhe.com/" + issue[:4] + "_" + issue[-2:] + "/"

    # The issue's index page links (first <a> inside its first <table>) to
    # the page that is then used as the catalog.
    index_soup = _fetch_soup(url + "index.html")
    catalog_soup = _fetch_soup(url + index_soup.table.a.get("href"))

    duzhe = dict()
    for heading in catalog_soup.find_all("h2"):
        column = heading.string
        print(column)
        duzhe[column] = []
        for link in heading.parent.find_all("a"):
            target = link.get("href")
            if not target:
                # An anchor without href cannot be turned into an article URL.
                continue
            href = url + target
            print(href)
            article = _fetch_article_with_retry(href)
            if article is not None:
                duzhe[column].append(article)
    return duzhe


def readDuZhe(duzhe):
    """Print the title of every article in *duzhe*.

    Args:
        duzhe: Mapping of column name to a list of article dicts, as
            returned by ``getCatalog``.
    """
    for eachColumn in duzhe:
        for eachArticle in duzhe[eachColumn]:
            print(eachArticle["Title"])


if __name__ == '__main__':
    # Optional first argument: issue id (e.g. 201501); defaults to "201424".
    issue = sys.argv[1] if len(sys.argv) > 1 else "201424"
    readDuZhe(getCatalog(issue))
以上是内存溢出(jb51.cc)为你收集整理的全部代码内容,希望文章能够帮你解决所遇到的程序开发问题。
如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。
总结:以上是内存溢出为你收集整理的《文艺一把!Python爬取读者制作PDF!》全部内容,希望文章能够帮你解决《文艺一把!Python爬取读者制作PDF!》所遇到的程序开发问题。
如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)