# _*_ coding: utf-8 _*_
"""笔趣阁 novel-site crawler: download one specified novel with requests + bs4"""
from typing import Optional, Dict, List, Tuple

import requests
from bs4 import BeautifulSoup


def request_url(url: str, headers: Optional[Dict[str, str]] = None,
                params: Optional[Dict[str, str]] = None) -> requests.Response:
    """
    Download a page with a GET request.
    :param url: the address to fetch
    :param headers: request headers, normally including a User-Agent
    :param params: query parameters; requests appends a dict automatically
    :return: Response object
    """
    res = requests.get(url, headers=headers, params=params)
    # Raise an exception if the response is not 200 OK
    res.raise_for_status()
    # Set the text encoding (the site serves GBK)
    res.encoding = 'gbk'
    return res


def parse_chapter(res: requests.Response) -> List[Tuple[str, str]]:
    """
    Parse the chapter list page.
    :param res: Response object of the novel's index page
    :return: list of (title, relative url) tuples
    """
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')  # lxml would parse faster
    urls = []
    for dd in soup.find(name='div', class_='listmain').dl.find_all(name='dd'):
        url = dd.a['href']
        title = dd.a.get_text(strip=True)
        urls.append((title, url))
    return urls


def parser_content(res: requests.Response) -> str:
    """
    Parse the body text of a single chapter page.
    :param res: Response object of a chapter page
    :return: chapter text, one paragraph per line
    """
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')
    # The text sits in <div id="content"> as text nodes separated by <br> tags,
    # so collect the text nodes themselves (the <br> elements carry no text).
    contents = [text for text in soup.find("div", id='content').stripped_strings]
    content = '\n'.join(contents)
    return content


def main(start_url, base):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/91.0.4472.114 Safari/537.36'
    }
    res = request_url(url=start_url, headers=headers)
    for title, url in parse_chapter(res):
        url = base + url
        res = request_url(url=url, headers=headers)
        content = parser_content(res)  # the chapter text is fetched but not saved anywhere yet
        print(f"{title}: {url} done")


if __name__ == '__main__':
    # 沧元图 (the novel to download)
    url = "https://www.bqkan8.com/38_38836/"
    base_url = "https://www.bqkan8.com"  # site domain, used to build absolute chapter URLs
    main(start_url=url, base=base_url)
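
As written, main() only prints a progress line and discards the chapter text. Below is a minimal sketch of how the novel could be persisted to disk; it assumes the request_url, parse_chapter and parser_content functions above, and the output filename, UTF-8 encoding and 0.5-second pause between requests are illustrative choices of mine, not part of the original script.

import time


def download_novel(start_url: str, base: str, out_path: str = 'novel.txt') -> None:
    # Hypothetical helper: crawl every chapter and append it to one text file.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/91.0.4472.114 Safari/537.36'
    }
    index_res = request_url(url=start_url, headers=headers)
    with open(out_path, 'w', encoding='utf-8') as f:
        for title, rel_url in parse_chapter(index_res):
            chapter_res = request_url(url=base + rel_url, headers=headers)
            f.write(title + '\n')
            f.write(parser_content(chapter_res) + '\n\n')
            time.sleep(0.5)  # small pause between requests; the interval is an arbitrary choice
            print(f"{title} saved to {out_path}")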