import requests
import os,re
class GetStaticWeb:
    """Fetch chapter pages of a web novel and append them to a text file.

    The output file doubles as the resume state: every chapter written by
    ``saveTxt`` ends with the chapter's page URL followed by a blank line,
    and ``getStartUrl`` reads that URL back to decide where to continue.
    """

    # Prefix placed in front of the "next chapter" href found on a page.
    base_url = 'https://www.99mk.com'

    # Pattern used to select the chapter body from the page.
    # NOTE(review): a bare lazy [\w\W]*? always matches the empty string, so
    # the extracted body is always ''. The tags that were meant to delimit
    # the body are presumably missing from this pattern -- restore them here
    # (or override this attribute) once the page markup is confirmed.
    content_pattern = r'[\w\W]*?'

    def __init__(self) -> None:
        # Browser-like User-Agent sent with every request.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.50'}

    def getHtml(self, url, timeout=30):
        """Download *url* and return the ``requests`` response object.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds passed to ``requests.get`` as its timeout, so a
                stalled server cannot block the crawl forever.
        """
        return requests.get(url, headers=self.headers, timeout=timeout)

    def openHtml(self, res, name='out.html'):
        """Dump the response text to the file *name* (debugging aid).

        Args:
            res: Response whose ``text`` is written, using ``res.encoding``
                as the file encoding.
            name: Path of the file to (over)write.
        """
        with open(name, 'w', encoding=res.encoding) as f:
            f.write(res.text)

    def analysisHtml(self, res):
        """Extract the chapter text and the next-chapter URL from a page.

        Args:
            res: Response for a chapter page; its ``content`` and ``url``
                attributes are read.

        Returns:
            A ``(text, url, nextUrl)`` tuple: the formatted chapter block
            (heading, body, page URL, blank line), the URL of this page and
            the URL of the following chapter.

        Raises:
            ValueError: If the page has no chapter heading or no
                next-chapter link.
        """
        # Decode the raw bytes as UTF-8 directly instead of re-encoding
        # ``res.text`` with the charset requests guessed: that round-trip
        # fails when ``res.encoding`` is None.
        html = res.content.decode('utf-8', errors='replace')
        html = html.replace('\r', '\n')

        # Chapter heading.
        heading = re.search(r'.*(第.*章.*)<', html)
        if heading is None:
            raise ValueError(f'no chapter heading found in {res.url}')
        chapter = heading.group(1)
        print(f'正在获取 {chapter}')

        # Chapter body: strip tags and HTML entities, collapse blank lines.
        content = re.search(self.content_pattern, html).group()
        content = re.sub(r'(<.*?>)|(&.*?;)', '', content)
        content = content.replace('\n\n', '\n')
        text = chapter + '\n' + content + '\n' + res.url + '\n\n'

        # Next-chapter link. ``[^"]*`` keeps the capture inside a single
        # href attribute even when several links share one line.
        link = re.search(r'href="([^"]*)">下一章', html)
        if link is None:
            raise ValueError(f'no next-chapter link found in {res.url}')
        nextUrl = self.base_url + link.group(1)
        return text, res.url, nextUrl

    def saveTxt(self, text, url, name='踏星.txt'):
        """Append *text* to the file *name* unless *url* is already in it.

        Args:
            text: Chapter block to append.
            url: Page URL used as the "already saved" marker.
            name: Path of the output file; it must already exist.

        Returns:
            True if the text was written, False if *url* was already present.
        """
        with open(name, 'r+', encoding='utf-8') as f:
            if url in f.read():
                return False
            # The read above left the position at end-of-file, so this
            # write appends.
            f.write(text)
            return True

    def getStartUrl(self, name='踏星.txt'):
        """Return the URL stored on the second-to-last line of *name*.

        The surrounding whitespace (including the line's trailing newline)
        is stripped so the result can be used as a request URL as-is.

        Raises:
            IndexError: If the file has fewer than two lines.
        """
        with open(name, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        return lines[-2].strip()
if __name__ == '__main__':
    # Resume from the URL recorded in the output file, then follow the
    # next-chapter links for as long as they still contain 'html'.
    scraper = GetStaticWeb()
    current_url = scraper.getStartUrl()
    while 'html' in current_url:
        response = scraper.getHtml(current_url)
        # scraper.openHtml(response) can be called here to dump the raw page.
        chapter_text, page_url, current_url = scraper.analysisHtml(response)
        scraper.saveTxt(chapter_text, page_url)
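# (Web page footer, not part of the script.)
# Sharing is welcome; when reposting, please credit the source: 内存溢出
# Comments (0)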