30张图片(easy!)好多好多图片!
复习C++之前先玩了一下爬虫,这学期属于是入了小凯的坑爬不出来了qwq
先看结果捏~
学习了这篇:
https://zhuanlan.zhihu.com/p/292360978
直接放代码惹(其他的百度图片我觉得直接改url就行捏)
然后headers可以参考一下 找到自己的header!
# Scrape a Baidu image-search result page by regex-matching the "objURL"
# entries embedded in the raw HTML, then download every image found.
import os
import re

import requests
from bs4 import BeautifulSoup

# Search-result page; `word=` is the URL-encoded query keyword, so the
# script can be reused for other searches by swapping that parameter.
url = ('https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592'
       '&lm=-1&cl=2&nc=1&ie=utf-8&dyTabStr=MCwzLDEsNiw0LDUsNywyLDgsOQ%3D%3D'
       '&word=%E7%8E%8B%E4%BF%8A%E5%87%AF')
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
}

r = requests.get(url=url, headers=headers)
r.encoding = r.apparent_encoding
soup = BeautifulSoup(r.text, 'html.parser')
print(soup.prettify())

# Image URLs are embedded in the page's inline JSON as "objURL":"<url>",
# so a regex over the raw text is enough — no DOM walking needed.
# (The original also built an lxml etree that was never used; removed.)
pic_urls = re.findall('"objURL":"(.*?)",', r.text, re.S)

# FIX: create the output directory instead of requiring the user to make
# it by hand (the original only had a comment asking for that).
save_dir = '王俊凯'
os.makedirs(save_dir, exist_ok=True)

# FIX: the counter `i` was never initialized in the original (NameError),
# and the path literal '王俊凯\' had a backslash escaping the closing
# quote (SyntaxError). Initialize the counter and build the path with
# os.path.join, which is also portable across OSes.
i = 0
for each in pic_urls:
    print(each)
    try:
        pic = requests.get(each, timeout=10)
    except requests.exceptions.ConnectionError:
        print('错误:当前图片无法下载')
        continue
    # Context manager guarantees the file handle is closed even on error.
    with open(os.path.join(save_dir, str(i) + '.jpg'), 'wb') as fp:
        fp.write(pic.content)
    i += 1
# -*- coding:utf-8 -*-
# Selenium-based variant: render the Baidu image-search page in a real
# Chrome browser so the dynamically loaded <li class="imgitem"> nodes are
# present in the DOM, then download every image on each result page.
import os
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
url = ('https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592'
       '&lm=-1&cl=2&nc=1&ie=utf-8&dyTabStr=MCwzLDEsNiw0LDUsNywyLDgsOQ%3D%3D'
       '&word=%E7%8E%8B%E4%BF%8A%E5%87%AF')
driver.get(url)
time.sleep(5)  # crude wait for the page's image grid to finish rendering

# FIX: the original `driver.enconding = r.encoding` referenced an undefined
# name `r` (NameError) via a misspelled attribute; Selenium's page_source is
# already decoded text, so the line is simply dropped.
soup = BeautifulSoup(driver.page_source, 'html.parser')
body = soup.find('div', attrs={'id': 'wrapper'})
body = body.find('div', attrs={'id': 'imgContainer'})
body = body.find('div', attrs={'id': 'imgid'})

# FIX: create the output directory before writing into it.
save_dir = '王俊凯'
os.makedirs(save_dir, exist_ok=True)

i = 0      # page counter: result pages are <ul class="... pageNum0">, pageNum1, ...
count = 0  # total images successfully written
for page in body.find_all('div', attrs={'class': 'imgpage'}):
    img_list = page.find('ul', attrs={'class': 'imglist clearfix pageNum' + str(i)})
    i += 1
    for item in img_list.find_all('li', attrs={'class': 'imgitem'}):
        try:
            img_url = item.attrs['data-objurl']
            print(img_url)
            image = requests.get(img_url, timeout=10)
            # FIX: '王俊凯\' escaped the closing quote in the original
            # (SyntaxError); build the path with os.path.join instead.
            with open(os.path.join(save_dir, str(count) + '.jpg'), 'wb') as fp:
                fp.write(image.content)
            count += 1
        except Exception:
            # Best-effort skip kept from the original, but narrowed from a
            # bare `except:` so KeyboardInterrupt/SystemExit still propagate.
            print('wrong!')

print('爬取图片总数:', count)
print('共爬取', i, '页')
driver.quit()  # release the browser session once scraping is done
可以啦~要去好好复习C++了qaq
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)