import requests
from bs4 import BeautifulSoup
import pymysql
# Fetch a page, sending a browser User-Agent so the request is not blocked
def download(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.38"
    }
    response = requests.get(url, headers=headers)
    return response
# Parse the page and collect the hot-news headlines
def bs(url):
    html_text = download(url)
    soup = BeautifulSoup(html_text.text, 'lxml')
    # Selector matched the Baidu News layout at the time of writing
    newsList = soup.select('#pane-news .hotnews ul>li')
    allnews = []
    for newsTag in newsList:
        news_dict = {}
        # Headline text
        news_dict["title"] = newsTag.select('a')[0].get_text()
        # Detail-page URL
        news_dict["url"] = newsTag.select('a')[0].get('href')
        allnews.append(news_dict)
    print(allnews)
    return allnews
def get_config(host, user, password, db):
    db_config = {
        'host': host,
        'user': user,
        'password': password,
        'db': db
    }
    return db_config
def getConn(allnews):
    db_config = get_config('localhost', 'root', 'admin', 'python')
    conn = pymysql.Connect(**db_config)
    cur = conn.cursor()
    for allnew in allnews:
        title = allnew['title']
        url = allnew['url']
        # Parameterized query instead of string concatenation:
        # avoids SQL injection and breakage on titles containing quotes
        sql = "INSERT INTO baidunews(title, url) VALUES (%s, %s)"
        cur.execute(sql, (title, url))
    # Commit once after all rows are inserted
    conn.commit()
    cur.close()
    conn.close()
    print("Operation complete")
if __name__ == "__main__":
    url = "http://news.baidu.com/"
    allnews = bs(url)
    getConn(allnews)
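
The INSERT in getConn assumes a baidunews table already exists in the python database. A minimal one-off setup sketch is shown below; the id column and the VARCHAR lengths are assumptions, not part of the original script, so adjust them to fit your data:

import pymysql

# One-off setup: create the table the scraper inserts into
conn = pymysql.Connect(host='localhost', user='root', password='admin', db='python')
with conn.cursor() as cur:
    cur.execute(
        "CREATE TABLE IF NOT EXISTS baidunews ("
        "id INT AUTO_INCREMENT PRIMARY KEY, "
        "title VARCHAR(255) NOT NULL, "
        "url VARCHAR(512) NOT NULL)"
    )
conn.commit()
conn.close()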