python爬虫入门与综合应用

python爬虫入门与综合应用,第1张

概述 1.练习一:爬取百度首页数据:import requests 后用 re=requests.get("https://www.baidu.com") 发出 http 请求,再用 print(re.status_code) 查看响应状态(输出 200,200 就是响应的状态码,表示请求成功;我们可以通过 re.status_code 的值来判断请求是否成功)。2.用爬虫下载孔乙己的文章

1.练习一:爬取百度首页数据

import requests# 发出http请求re=requests.get("https://www.baIDu.com")# 查看响应状态print(re.status_code)#输出:200#200就是响应的状态码,表示请求成功#我们可以通过res.status_code的值来判断请求是否成功。

2.用爬虫下载孔乙己的文章,网址【https://apiv3.shanbay.com/codetime/articles/mnvdu】

import requests# 发出http请求re = requests.get('https://APIv3.shanbay.com/codetime/articles/mnvdu')# 查看响应状态print('网页的状态码为%s'%re.status_code)with open('鲁迅文章.txt', 'w') as file:  # 将数据的字符串形式写入文件中  print('正在爬取小说')  file.write(re.text)

3.下载 Datawhale 的 logo:
【re.content 用于图片、视频、音频等二进制内容的获取、下载】

import requests# 发出http请求#下载图片res=requests.get('https://www.icode9.com/i/ll/?i=20210424184053989.PNG')# 以二进制写入的方式打开一个名为 info.jpg 的文件with open('datawhale.png','wb') as ff:    # 将数据的二进制形式写入文件中    ff.write(res.content)

4.用 BeautifulSoup解析HTML网页
我们来解析豆瓣读书 top250

import ioimport sysimport requestsfrom bs4 import BeautifulSoup###运行出现乱码时可以修改编码方式#sys.stdout = io.TextIOWrapper(sys.stdout.buffer,enCoding='gb18030')###headers = {  'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}res = requests.get('https://book.douban.com/top250', headers=headers)soup = BeautifulSoup(res.text, 'lxml')print(soup)

5.自如公寓数据抓取

# Example 5: scrape Ziroom (Wuhan) apartment listings into a CSV file.
import requests
from bs4 import BeautifulSoup
import random
import time
import csv

# Desktop User-Agent pool; one is picked at random per request so the
# traffic looks less uniform. Restored the "Windows"/"Trident"/"Firefox"
# casing that had been mangled in the scraped source.
user_agent = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
]


def get_info():
    """Crawl the first 50 Ziroom Wuhan listing pages and append one CSV row
    per apartment to wuhan_ziru.csv.

    NOTE(review): the scraped source defined this function twice with
    identical bodies; the duplicate has been removed.
    """
    csvheader = ['名称', '面积', '朝向', '户型', '位置', '楼层', '是否有电梯', '建成时间', ' 门锁', '绿化']
    with open('wuhan_ziru.csv', 'a+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(csvheader)
        # Fix: the comment said "50 pages" but range(1, 50) skipped page 50.
        for i in range(1, 51):
            print('正在爬取自如第%s页' % i)
            print('有点累了,需要休息一下啦(¬㉨¬)')
            # Sleep 1-3 s between pages to avoid overloading the server.
            time.sleep(random.choice([1, 2, 3]))
            url = 'https://wh.ziroom.com/z/p%s/' % i
            headers = {'User-Agent': random.choice(user_agent)}
            r = requests.get(url, headers=headers)
            # Fix: requests exposes .encoding / .apparent_encoding — the
            # mangled "r.apparent_enCoding" raised AttributeError.
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, 'lxml')
            # NOTE(review): class was "info-Box" in the scraped source;
            # lower-case "info-box" is assumed to be the real class — verify.
            all_info = soup.find_all('div', class_='info-box')
            print('开始干活咯(๑>؂<๑)')
            for info in all_info:
                href = info.find('a')
                if href is not None:
                    href = 'https:' + href['href']
                    try:
                        print('正在爬取%s' % href)
                        writer.writerow(get_house_info(href))
                    except Exception:
                        # Best-effort: skip listings whose detail page fails.
                        print('出错啦,%s进不去啦( •̥́ ˍ •̀ू )' % href)


def get_house_info(href):
    """Fetch one listing's detail page and return the CSV row:
    [name, area, orientation, layout, location, floor, elevator,
    build year, door lock, greening] — matching the CSV header order.
    """
    time.sleep(1)  # be gentle with the detail pages too
    headers = {'User-Agent': random.choice(user_agent)}
    response = requests.get(url=href, headers=headers)
    # Decode manually, ignoring undecodable bytes.
    html = response.content.decode('utf-8', 'ignore')
    soup = BeautifulSoup(html, 'lxml')
    name = soup.find('h1', class_='Z_name').text
    sinfo = soup.find('div', class_='Z_home_b clearfix').find_all('dd')
    area = sinfo[0].text
    orien = sinfo[1].text       # orientation (朝向)
    area_type = sinfo[2].text   # layout (户型)
    dinfo = soup.find('ul', class_='Z_home_o').find_all('li')
    location = dinfo[0].find('span', class_='va').text
    loucen = dinfo[1].find('span', class_='va').text    # floor (楼层)
    dianti = dinfo[2].find('span', class_='va').text    # elevator (电梯)
    niandai = dinfo[3].find('span', class_='va').text   # build year (建成时间)
    mensuo = dinfo[4].find('span', class_='va').text    # door lock (门锁)
    lvhua = dinfo[5].find('span', class_='va').text     # greening (绿化)
    # (The original had a stray no-op list literal here repeating the CSV
    # header; removed as dead code.)
    return [name, area, orien, area_type, location, loucen, dianti, niandai, mensuo, lvhua]


if __name__ == '__main__':
    get_info()

6.36kr信息抓取与邮件发送

# Example 6: scrape 36kr newsflashes and send them by e-mail through QQ Mail.
import requests
import random
from bs4 import BeautifulSoup
import smtplib                        # SMTP client used to send the mail
from email.mime.text import MIMEText  # builds the mail body
# Fix: the class is Header (capital H) — the scraped
# "from email.header import header" raised ImportError.
from email.header import Header

smtpserver = 'smtp.qq.com'
# Credentials of the sending mailbox (fill in before running).
user = ''
password = ''
# Sender and recipient addresses (fill in before running).
sender = ''
receive = ''

# Desktop User-Agent pool; one is picked at random per request. Restored the
# "Windows"/"Trident"/"Firefox" casing mangled in the scraped source.
user_agent = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
]


def main():
    """Scrape the 36kr newsflash list and e-mail the titles with links."""
    print('正在爬取数据')
    url = 'https://36kr.com/newsflashes'
    headers = {'User-Agent': random.choice(user_agent)}
    response = requests.get(url, headers=headers)
    # Decode manually, ignoring undecodable bytes.
    html = response.content.decode('utf-8', 'ignore')
    soup = BeautifulSoup(html, 'lxml')
    # NOTE(review): class was "item-Title" in the scraped source; lower-case
    # "item-title" is assumed to be the real class — verify against the site.
    news = soup.find_all('a', class_='item-title')
    news_list = []
    for item in news:
        title = item.get_text()
        href = 'https://36kr.com' + item['href']
        news_list.append(title + '<br>' + href)
    info = '<br></br>'.join(news_list)
    print('正在发送信息')
    send_email(info)


def send_email(content):
    """Send *content* as an HTML mail through QQ Mail over implicit SSL."""
    subject = '36kr快讯'
    msg = MIMEText(content, 'html', 'utf-8')
    msg['Subject'] = Header(subject, 'utf-8')
    msg['From'] = sender
    msg['To'] = receive
    # QQ Mail requires implicit SSL, which uses port 465.
    smtp = smtplib.SMTP_SSL(smtpserver, 465)
    try:
        # login() issues EHLO/HELO itself; the original's explicit
        # helo() + ehlo() pair was redundant.
        smtp.login(user, password)
        smtp.sendmail(sender, receive, msg.as_string())
    finally:
        smtp.quit()


if __name__ == '__main__':
    main()

注:源自 Datawhale 的自动化办公课程

总结

以上是内存溢出为你收集整理的python爬虫入门综合应用全部内容,希望文章能够帮你解决python爬虫入门与综合应用所遇到的程序开发问题。

如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/langs/1159174.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-06-01
下一篇 2022-06-01

发表评论

登录后才能评论

评论列表(0条)

保存