Crawling news with Python


Overview

Two scripts make up this small news search engine. The first one below queries the inverted index stored in viewsdu.db and ranks the matching pages by TF-IDF; the second, search_engine_build-2.py, crawls the site and builds that index.
# Query the index built by search_engine_build-2.py and rank results by TF-IDF
import re
import urllib
from urllib import request
from collections import deque
from bs4 import BeautifulSoup
import lxml
import sqlite3
import jieba
import math

conn = sqlite3.connect("viewsdu.db")
c = conn.cursor()
c.execute('select count(*) from doc')
N = 1 + c.fetchall()[0][0]
target = input("请输入搜索词")
seggen = jieba.cut_for_search(target)  # segment the search query with jieba
score = {}
for word in seggen:
    print('得到查询词: ', word)
    tf = {}
    c.execute('select list from word where term=?', (word,))
    result = c.fetchall()
    if len(result) > 0:
        doclist = result[0][0]
        doclist = doclist.split(' ')
        doclist = [int(x) for x in doclist]
        df = len(set(doclist))      # document frequency of the term
        idf = math.log(N / df)
        print('idf: ', idf)
        for num in doclist:         # term frequency per document
            if num in tf:
                tf[num] = tf[num] + 1
            else:
                tf[num] = 1
        for num in tf:              # accumulate tf * idf per document
            if num in score:
                score[num] = score[num] + tf[num] * idf
            else:
                score[num] = tf[num] * idf
sortedlist = sorted(score.items(), key=lambda d: d[1], reverse=True)
print('得分列表', sortedlist)
cnt = 0
for num, docscore in sortedlist:
    cnt = cnt + 1
    c.execute('select link from doc where id=?', (num,))
    url = c.fetchall()[0][0]
    print(url, '得分: ', docscore)
    try:
        response = request.urlopen(url)
        content = response.read().decode('utf-8')
    except Exception:
        print("网页读取出错")
        continue
    soup = BeautifulSoup(content, 'lxml')
    title = soup.title
    if title is None:
        print('No title.')
    else:
        title = title.text
        print(title)
    if cnt > 10:
        break
if cnt == 0:
    print("无搜索结果")
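For intuition, the scoring above is plain TF-IDF: each document's score is the sum, over the query terms, of the term's frequency in that document times log(N/df). The following is a minimal, self-contained sketch of that calculation using a made-up postings dictionary; the terms and document ids are illustrative only and are not taken from the real viewsdu.db.

import math

# Hypothetical inverted index: term -> posting list of document ids.
# A document id repeats once per occurrence of the term, as in the script above.
postings = {
    '新闻': [1, 1, 2, 3],
    '学院': [2, 2, 2],
}
# As in the script, N is one more than the document count, so idf stays positive
# even for a term that appears in every document.
N = 5

score = {}
for word, doclist in postings.items():
    df = len(set(doclist))          # number of distinct documents containing the term
    idf = math.log(N / df)          # rarer terms weigh more
    tf = {}
    for num in doclist:             # term frequency per document
        tf[num] = tf.get(num, 0) + 1
    for num, freq in tf.items():    # accumulate tf * idf per document
        score[num] = score.get(num, 0) + freq * idf

print(sorted(score.items(), key=lambda d: d[1], reverse=True))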

 

# search_engine_build-2.py (crawl the pages and save the index)
import sys
from collections import deque
import urllib
from urllib import request
import re
from bs4 import BeautifulSoup
import lxml
import sqlite3
import jieba

url = 'https://www.fjut.edu.cn/561/list.htm'  # 'http://www.zut.edu.cn'  # entry point
unvisited = deque()  # queue of links still to crawl, processed breadth-first
visited = set()      # set of links already visited
unvisited.append(url)

conn = sqlite3.connect("viewsdu.db")
c = conn.cursor()
# The drop table statements are only needed when the tables already exist from an
# earlier run; uncomment them to delete the old tables and rebuild from scratch.
# c.execute('drop table doc')
c.execute('create table doc (id int primary key, link text)')
# c.execute('drop table word')
c.execute('create table word (term varchar(25) primary key, list text)')
conn.commit()
conn.close()

print('***************开始!*****************************')
cnt = 0
print('开始。。。。。 ')
while unvisited:
    url = unvisited.popleft()
    visited.add(url)
    cnt += 1
    print('开始抓取第', cnt, '个链接:', url)

    # Fetch the page
    try:
        response = request.urlopen(url)
        content = response.read().decode('utf-8')
    except Exception:
        continue

    # Look for further crawlable links. The crawl stays inside this site, so the
    # URL checks below are specific to how this particular site formats its links.
    soup = BeautifulSoup(content, 'lxml')
    all_a = soup.find_all('a', {'target': "_blank"})  # all news links <a> on this page
    for a in all_a:
        x = a.attrs['href']  # the URL
        if not re.match(r'^/', x):
            continue
        x = 'https://www.fjut.edu.cn' + x
        # print(x)
        if (x not in visited) and (x not in unvisited):
            unvisited.append(x)
    a = soup.find('a', {'class': "next"})  # "next page" <a>
    if a is not None:
        x = a.attrs['href']
        x = 'https://www.fjut.edu.cn/' + x
        if (x not in visited) and (x not in unvisited):
            unvisited.append(x)

    # Parse the page content; there are a few possible layouts, handled case by
    # case according to how this site's pages are structured.
    title = soup.title
    article = soup.find('div', class_="Article_Content")
    if article and article.find_all(re.compile("^p")):
        all_p = article.find_all(re.compile("^p"))
        article = ""
        for p in all_p:
            p_str = p.get_text("", strip=True)
            p_str = ''.join(p_str.split())
            article += p_str
        print(article)
    elif article and article.find_all(re.compile("^div")):
        all_p = article.find_all(re.compile("^div"))
        article = ""
        for p in all_p:
            p_str = p.get_text("", strip=True)
            p_str = ''.join(p_str.split())
            article += p_str
        print(article)
    else:
        article = ''
    if title is None:
        print('无内容的页面。')
        continue
    else:
        title = title.text
        title = ''.join(title.split())
    print('网页标题:', title)

    # The extracted content now lives in the title and article strings;
    # segment both with jieba for Chinese word segmentation.
    seggen = jieba.cut_for_search(title)
    seglist = list(seggen)
    seggen = jieba.cut_for_search(article)
    seglist += list(seggen)

    # Store the document and update the inverted index
    conn = sqlite3.connect("viewsdu.db")
    c = conn.cursor()
    c.execute('insert into doc values(?,?)', (cnt, url))
    # Maintain the posting list for every segmented word
    for word in seglist:
        # Check whether this term already exists in the database
        c.execute('select list from word where term=?', (word,))
        result = c.fetchall()
        if len(result) == 0:
            # Not there yet: create a new posting list for the term
            docliststr = str(cnt)
            c.execute('insert into word values(?,?)', (word, docliststr))
        else:
            # Already there: append this document id to the existing posting list
            docliststr = result[0][0]
            docliststr += ' ' + str(cnt)
            c.execute('update word set list=? where term=?', (docliststr, word))
    conn.commit()
    conn.close()
print('词表建立完毕')
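After the build script finishes, it can be handy to sanity-check what ended up in the database before running the search script. Below is a minimal sketch, assuming viewsdu.db sits in the current working directory; the doc and word tables and their columns are exactly the ones created above.

import sqlite3

# Inspect the index built by search_engine_build-2.py.
conn = sqlite3.connect("viewsdu.db")
c = conn.cursor()

c.execute('select count(*) from doc')
print('documents indexed:', c.fetchall()[0][0])

# A few crawled pages: (id, link)
c.execute('select id, link from doc limit 3')
for row in c.fetchall():
    print(row)

# A few terms and their posting lists: (term, space-separated document ids)
c.execute('select term, list from word limit 3')
for row in c.fetchall():
    print(row)

conn.close()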

 

Summary

Together the two scripts form a tiny site-specific search engine: search_engine_build-2.py crawls the news pages on www.fjut.edu.cn breadth-first, segments each page's title and body with jieba, and stores the links plus an inverted index in the SQLite database viewsdu.db; the first script then segments a search query, scores the indexed documents with TF-IDF, and prints the top matching pages with their titles.
