re.text is for text content (the decoded string form of the response).
re.content is for binary content such as images, video and audio.
import requests

# Send an HTTP request
re = requests.get('https://apiv3.shanbay.com/codetime/articles/mnvdu')
# Check the response status code
print('Status code of the page: %s' % re.status_code)
with open('鲁迅文章.txt', 'w') as file:
    # Write the string form of the response into the file
    print('Crawling the article')
    file.write(re.text)
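One caveat worth noting: requests guesses the text encoding from the response headers, and Chinese pages are sometimes decoded incorrectly as a result. A minimal, hedged sketch (reusing the same URL and filename as above, not part of the original tutorial) that fixes the encoding before reading re.text:

import requests

re = requests.get('https://apiv3.shanbay.com/codetime/articles/mnvdu')
# apparent_encoding is detected from the response body itself,
# which is usually more reliable than the header-based guess
re.encoding = re.apparent_encoding
with open('鲁迅文章.txt', 'w', encoding='utf-8') as file:
    file.write(re.text)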
import requests

# Send an HTTP request to download an image
res = requests.get('https://www.icode9.com/i/ll/?i=20210424184053989.PNG')
# Open a file named datawhale.png in binary write mode
with open('datawhale.png', 'wb') as ff:
    # Write the binary form of the response into the file
    ff.write(res.content)
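For larger files or flaky networks, the download can be made a bit more robust. A hedged sketch assuming the same image URL; the timeout value and chunk size are arbitrary choices, not part of the original tutorial:

import requests

res = requests.get('https://www.icode9.com/i/ll/?i=20210424184053989.PNG',
                   timeout=10, stream=True)
# Raise an exception if the server returned an error status code
res.raise_for_status()
with open('datawhale.png', 'wb') as ff:
    # Write the response in chunks instead of loading it all into memory at once
    for chunk in res.iter_content(chunk_size=8192):
        ff.write(chunk)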
2. Crawling a university website's first-, second- and third-level pages with BeautifulSoup
import json
import requests
from bs4 import BeautifulSoup

# Get URLs from a page
def getURLs(url):
    # Collect links per call (a fresh list each time, so results from
    # previous pages do not leak into this one)
    url_list = []
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    links = soup.find_all('a')
    for link in links:
        url_href = link.get("href")
        url_list.append(url_href)
    # Complement relative paths
    url_str = list(map(lambda x: str(x), url_list))
    for i in range(len(url_str)):
        if len(url_str[i]) <= 0:
            continue
        if url_str[i][0] == "/":
            url_tail = url_str[i][1:]
            url_str[i] = url + url_tail
    # Keep only FST web pages and remove duplicated items
    url_fst = list(set(filter(lambda x: "https://www.fst" in x, url_str)))
    url_fst = list(set(filter(lambda x: x[-1] == "/", url_fst)))
    return url_fst

# Save the output
def save(data):
    conv = json.dumps(data)
    f = open(r"C:\Users\Sandra\Desktop\url\url.txt", "a", encoding='utf-8')
    f.write(conv + "\n")
    f.close()

# Homepage
print("Homepage")
result1 = getURLs('https://www.fst.um.edu.mo/')

# Subpages
print("Subpages")
result2 = []
for i in result1:
    if i != "https://www.fst.um.edu.mo/":
        result2 = result2 + getURLs(i)

# Remove items that appear in both the 1st- and 2nd-layer links
result2_ = set(result2) | set(result1)
uniq_result2 = list(result2_ - set(result1))

# Subsubpages
print("Subsubpages")
result3 = []
for j in uniq_result2:
    result3 = result3 + getURLs(j)

fst_urls = list(set(result1 + uniq_result2 + result3))
for item in fst_urls:
    save(item)
print(len(fst_urls))
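Since each line written to url.txt is a JSON-encoded string, the collected URLs can be read back afterwards. A minimal sketch, assuming the same file path as above:

import json

urls = []
with open(r"C:\Users\Sandra\Desktop\url\url.txt", "r", encoding='utf-8') as f:
    for line in f:
        # Each line was written with json.dumps, so decode it back to a string
        urls.append(json.loads(line))
print(len(urls))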
Summary: the above covers Python Office Automation | Task 05: Getting Started with Python Web Scraping.