import re

# Split a string
one = 'asdfsdfas'
# Split wherever 's' appears
pattern = re.compile('s')
result = pattern.split(one)
print(result)

Output:
['a', 'df', 'dfa', '']

Test2 (regex - matching Chinese):

Code 1:
import re

# Match Chinese characters
two = '<h2 tid="tid-YkerKe" id="hid-Htc8Nb">Test4(正则表达式 - 纯数字的正则):</h2>'
# In Python, match Chinese by its Unicode range (the CJK analogue of [a-z])
pattern = re.compile('[\u4e00-\u9fa5]')
result = pattern.findall(two)
print(result)

Output 1:
['正', '则', '表', '达', '式', '纯', '数', '字', '的', '正', '则']

Code 2:
import re

# Match Chinese characters again, this time as whole runs
two = '<h2 tid="tid-YkerKe" id="hid-Htc8Nb">Test4(正则表达式 - 纯数字的正则):</h2>'
# Adding + matches one or more consecutive characters in the range
pattern = re.compile('[\u4e00-\u9fa5]+')
result = pattern.findall(two)
print(result)

Output 2:
['正则表达式', '纯数字的正则']

Test3 (regex - fetching a website):

Code:
import requests

url = 'https://news.baidu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
}
data = requests.get(url, headers=headers).content.decode()
# Save the fetched page to a local file
with open('02news.html', 'w', encoding='utf-8') as f:
    f.write(data)

Output: the page's HTML is written to 02news.html.

Test4 (regex - simple scrape of a news page):

Code 1:
# coding=gbk
import re
import requests

url = 'https://news.baidu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
}
data = requests.get(url, headers=headers).content.decode()
# Target anchors look like:
# '<a href="http://politics.people.com.cn/n1/2021/0303/c1001-32040808.html" target="_blank" mon="ct=1&a=1&c=top&pn=0">人民的信心和支持就是我们国家奋进的力量</a>'
pattern = re.compile('<a href="(.*?)" target="_blank" mon="(.*?)">(.*?)</a>')
result = pattern.findall(data)
print(result)

Output 1: a list of (href, mon, text) tuples, one per matching anchor.

Code 2:
# coding=gbk
import re
import requests

url = 'https://news.baidu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
}
data = requests.get(url, headers=headers).content.decode()
# A looser pattern that matches every anchor tag
pattern = re.compile('<a(.*?)</a>')
result = pattern.findall(data)
print(result)

Output 2: every anchor on the page, captured as one raw string per tag.
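A detail worth noting before moving on: re.findall changes its return shape with the number of capture groups — one group yields a list of strings, several groups yield a list of tuples. A minimal sketch (the sample anchor is made up for illustration, modeled on the comment above):

import re

# A made-up sample anchor, shaped like the real ones
tag = '<a href="http://example.com/x.html" target="_blank" mon="ct=1">标题</a>'

# One capture group: findall returns a list of strings
print(re.findall('<a(.*?)</a>', tag))
# [' href="http://example.com/x.html" target="_blank" mon="ct=1">标题']

# Three capture groups: findall returns a list of tuples
print(re.findall('<a href="(.*?)" target="_blank" mon="(.*?)">(.*?)</a>', tag))
# [('http://example.com/x.html', 'ct=1', '标题')]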
· XPath:

Test1 (basic XPath usage):

Code:

# coding=gbk
import requests
from lxml import etree

url = 'https://news.baidu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
}
data = requests.get(url, headers=headers).content.decode()

# 1. Convert to a parseable type
xpath_data = etree.HTML(data)

# 2. Call the xpath method
result1 = xpath_data.xpath('/html/head/title/text()')
result2 = xpath_data.xpath('//a/text()')
result3 = xpath_data.xpath('//a[@mon="ct=1&a=1&c=top&pn=0"]/text()')
result4 = xpath_data.xpath('//a[@mon="ct=1&a=1&c=top&pn=0"]/@href')
result5 = xpath_data.xpath('//li/a/text()')
print(result1)
print(result2)
print(result3)
print(result4)
print(result5)

Output: the page title, all anchor texts, the anchors filtered by their mon attribute (text and href), and all anchor texts under li elements.

Notes:
XPath syntax:
1. Node: /
2. Skip across nodes: //
3. Exact tag: //a[@attribute="value"]
4. Text wrapped by a tag: /text()
5. Attribute: @href
6. XPath returns a list
XPath indexing starts at 1, and an index only selects among sibling tags, as the sketch below shows.
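A minimal, self-contained sketch of the rules above, run against an inline HTML snippet (the snippet itself is made up for illustration):

from lxml import etree

# A made-up snippet just to exercise the syntax rules above
html = '''
<html><head><title>demo</title></head>
<body>
  <ul>
    <li><a href="/a" class="hot">first</a></li>
    <li><a href="/b">second</a></li>
  </ul>
</body></html>
'''
x = etree.HTML(html)

print(x.xpath('/html/head/title/text()'))   # node by node: ['demo']
print(x.xpath('//a/text()'))                # skip across nodes: ['first', 'second']
print(x.xpath('//a[@class="hot"]/text()'))  # exact tag by attribute: ['first']
print(x.xpath('//a[@class="hot"]/@href'))   # attribute value: ['/a']
print(x.xpath('//li[1]/a/text()'))          # 1-based index among sibling li: ['first']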
Test2 (hands-on):

Take https://www.cnblogs.com/3cH0-Nu1L/default.html?page= as the example.
Code:

# coding=gbk
import requests
from lxml import etree


class BkySpider(object):
    def __init__(self):
        self.base_url = 'https://www.cnblogs.com/3cH0-Nu1L/default.html?page='
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
        }

    # 1. Send the request
    def get_response(self, url):
        response = requests.get(url, headers=self.headers)
        data = response.content.decode()
        return data

    # 2. Parse the data
    def parse_data(self, data):
        # Use XPath to pull every post title on the current page
        # 1. Convert to a parseable type
        x_data = etree.HTML(data)
        # 2. Extract along the XPath paths
        # (the attribute filter was lost in the original; class="postTitle2"
        # is the post-title link class in the default cnblogs theme)
        title_list = x_data.xpath('//a[@class="postTitle2"]/text()')
        url_list = x_data.xpath('//a[@class="postTitle2"]/@href')
        print(title_list)
        print(url_list)

    # 3. Save the data
    def save_data(self, data):
        with open('05bky.html', 'w', encoding='utf-8') as f:
            f.write(data)

    # 4. Run
    def run(self):
        # 1. Build the full URL
        url = self.base_url + '2'
        # 2. Send the request
        data = self.get_response(url)
        # 3. Parse
        self.parse_data(data)
        # 4. Save
        # self.save_data(data)


BkySpider().run()
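As written, run() only fetches page 2. One way to extend it to walk every page is the loop below (the page count 20 is a made-up placeholder; a real spider would read the count from the pager or stop when a page comes back empty):

# Hypothetical extension: crawl pages 1..20 instead of a single page
spider = BkySpider()
for page in range(1, 21):   # 20 is a placeholder, not taken from the site
    data = spider.get_response(spider.base_url + str(page))
    spider.parse_data(data)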