# Query script: searches the index stored in viewsdu.db by the build script below.
from urllib import request
from bs4 import BeautifulSoup
import sqlite3
import jieba
import math

conn = sqlite3.connect("viewsdu.db")
c = conn.cursor()
c.execute('select count(*) from doc')
N = 1 + c.fetchall()[0][0]               # number of indexed documents, plus one
target = input("Enter a search query: ")
seggen = jieba.cut_for_search(target)    # segment the query into terms
score = {}
for word in seggen:
    print('query term:', word)
    tf = {}
    c.execute('select list from word where term=?', (word,))
    result = c.fetchall()
    if len(result) > 0:
        doclist = result[0][0]
        doclist = doclist.split(' ')
        doclist = [int(x) for x in doclist]
        df = len(set(doclist))           # document frequency of the term
        idf = math.log(N / df)           # inverse document frequency
        print('idf:', idf)
        for num in doclist:              # term frequency per document
            if num in tf:
                tf[num] = tf[num] + 1
            else:
                tf[num] = 1
        for num in tf:                   # accumulate the TF-IDF score per document
            if num in score:
                score[num] = score[num] + tf[num] * idf
            else:
                score[num] = tf[num] * idf
sortedlist = sorted(score.items(), key=lambda d: d[1], reverse=True)
print('score list:', sortedlist)
cnt = 0
for num, docscore in sortedlist:
    cnt = cnt + 1
    c.execute('select link from doc where id=?', (num,))
    url = c.fetchall()[0][0]
    print(url, 'score:', docscore)
    try:
        response = request.urlopen(url)
        content = response.read().decode('utf-8')
    except:
        print("Failed to read the page.")
        continue
    soup = BeautifulSoup(content, 'lxml')
    title = soup.title
    if title is None:
        print('No title.')
    else:
        title = title.text
        print(title)
    if cnt > 10:                         # show only the top-ranked results
        break
if cnt == 0:
    print("No search results.")
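To make the ranking in the query script easier to follow, here is a minimal, self-contained sketch of the same TF-IDF calculation on made-up values; the document count N, the term, and its posting list are hypothetical, whereas the script reads them from the viewsdu.db database.

import math

# Toy example: suppose the index holds N = 11 documents (matching how the
# script computes N as count + 1), and one query term has the posting list
# "3 3 7" in the word table, i.e. it occurs twice in doc 3 and once in doc 7.
N = 11
doclist = [3, 3, 7]

df = len(set(doclist))            # document frequency: 2 documents contain the term
idf = math.log(N / df)            # inverse document frequency

tf = {}                           # term frequency per document
for num in doclist:
    tf[num] = tf.get(num, 0) + 1

score = {num: count * idf for num, count in tf.items()}
print(score)                      # {3: 2 * log(11/2), 7: 1 * log(11/2)}

Document 3 scores twice as high as document 7 because the term occurs in it twice; with several query terms, the script sums these per-term scores before sorting.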
# search_engine_build-2.py (crawl the site and build the index)
from collections import deque
from urllib import request
import re
from bs4 import BeautifulSoup
import sqlite3
import jieba

url = 'https://www.fjut.edu.cn/561/list.htm'   # entry page ('http://www.zut.edu.cn' would be another site's entry)
unvisited = deque()   # queue of links still to crawl (breadth-first search)
visited = set()       # set of links already visited
unvisited.append(url)

conn = sqlite3.connect("viewsdu.db")
c = conn.cursor()
# If the tables already exist from an earlier test run, drop them first so
# rerunning the script can recreate them:
# c.execute('drop table doc')
c.execute('create table doc (id int primary key, link text)')
# c.execute('drop table word')
c.execute('create table word (term varchar(25) primary key, list text)')
conn.commit()
conn.close()

print('*************** start *****************************')
cnt = 0
while unvisited:
    url = unvisited.popleft()
    visited.add(url)
    cnt += 1
    print('crawling link', cnt, ':', url)

    # Fetch the page content.
    try:
        response = request.urlopen(url)
        content = response.read().decode('utf-8')
    except:
        continue

    # Collect further links to crawl. The crawl stays inside the site, so the
    # link-format checks below are specific to how this site builds its URLs.
    soup = BeautifulSoup(content, 'lxml')
    all_a = soup.find_all('a', {'target': "_blank"})    # all news links on this page
    for a in all_a:
        x = a.attrs['href']                              # relative URL
        if not re.match(r'^/', x):
            continue
        x = 'https://www.fjut.edu.cn' + x
        if (x not in visited) and (x not in unvisited):
            unvisited.append(x)
    a = soup.find('a', {'class': "next"})                # "next page" link
    if a is not None:
        x = a.attrs['href']
        x = 'https://www.fjut.edu.cn/' + x
        if (x not in visited) and (x not in unvisited):
            unvisited.append(x)

    # Extract the title and article text; the parsing handles the few layouts
    # this site's pages use.
    title = soup.title
    article = soup.find('div', class_="Article_Content")
    if article and article.find_all(re.compile("^p")):
        all_p = article.find_all(re.compile("^p"))
        article = ""
        for p in all_p:
            p_str = p.get_text("", strip=True)
            p_str = ''.join(p_str.split())
            article += p_str
        print(article)
    elif article and article.find_all(re.compile("^div")):
        all_p = article.find_all(re.compile("^div"))
        article = ""
        for p in all_p:
            p_str = p.get_text("", strip=True)
            p_str = ''.join(p_str.split())
            article += p_str
        print(article)
    else:
        article = ''
    if title is None:
        print('Page has no content.')
        continue
    else:
        title = title.text
        title = ''.join(title.split())
        print('page title:', title)

    # The extracted title and article are plain strings; segment them with jieba.
    seggen = jieba.cut_for_search(title)
    seglist = list(seggen)
    seggen = jieba.cut_for_search(article)
    seglist += list(seggen)

    # Store the document and update the word table (the inverted index).
    conn = sqlite3.connect("viewsdu.db")
    c = conn.cursor()
    c.execute('insert into doc values(?,?)', (cnt, url))
    for word in seglist:
        # Check whether this term already has an entry in the database.
        c.execute('select list from word where term=?', (word,))
        result = c.fetchall()
        if len(result) == 0:
            # Not yet present: create a new posting list for the term.
            docliststr = str(cnt)
            c.execute('insert into word values(?,?)', (word, docliststr))
        else:
            # Already present: append this document number to the posting list.
            docliststr = result[0][0]
            docliststr += ' ' + str(cnt)
            c.execute('update word set list=? where term=?', (docliststr, word))
    conn.commit()
    conn.close()
print('Word table built.')
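Once the build script has finished, the inverted index can be inspected directly with sqlite3. The following is a minimal sketch under the schema used above (doc maps a document number to its URL; word maps each term to a space-separated posting list of document numbers); the printed example values are hypothetical.

import sqlite3

# Open the index the crawler produced and peek at a few rows.
conn = sqlite3.connect("viewsdu.db")
c = conn.cursor()

c.execute('select count(*) from doc')
print('indexed pages:', c.fetchall()[0][0])

c.execute('select term, list from word limit 5')
for term, doclist in c.fetchall():
    print(term, '->', doclist)     # e.g. 福建 -> "1 3 3 7" (hypothetical)

conn.close()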
Summary
The above is the complete example of crawling and indexing news pages with Python; hopefully it helps you solve the development problems you run into when scraping news.