2021-11-12

Qiushibaike example
import requests
from fake_useragent import UserAgent
import re

url = "https://www.qiushibaike.com/text/page/1/"
headers = {
    "User-Agent": UserAgent().random
}

# Build and send the request

response = requests.get(url, headers=headers)
info = response.text

print(info)

# Extract each post's text from <div class="content"><span>...</span>
infos = re.findall(r'<div class="content">\s*<span>\s*(.+)\s*</span>', info)
with open('duanzi.txt', 'a', encoding='utf-8') as f:
    for info in infos:
        f.write(info + "\n\n\n")
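
The snippet above only fetches page 1. A minimal sketch of looping over several pages follows; it assumes the /text/page/N/ URL pattern and the same content markup hold on later pages.

import requests
from fake_useragent import UserAgent
import re

# Hypothetical page range; adjust as needed.
for page in range(1, 4):
    url = f"https://www.qiushibaike.com/text/page/{page}/"
    headers = {"User-Agent": UserAgent().random}
    response = requests.get(url, headers=headers)
    # Same extraction pattern as above, applied to each page in turn.
    infos = re.findall(r'<div class="content">\s*<span>\s*(.+)\s*</span>', response.text)
    with open('duanzi.txt', 'a', encoding='utf-8') as f:
        for item in infos:
            f.write(item + "\n\n\n")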

BeautifulSoup
#!/usr/bin/env python
# -*- coding: utf-8 -*-

“”"
@author: yifan
@file: 19beautifulsoup.py
@time: 2021/11/12
@desc:
“”"
from bs4 import BeautifulSoup
from bs4.element import Comment

# Sample HTML used by the lookups below
str = '''
<title id="title">尚学堂</title>
<div class="info" float="left">Welcome to SXT</div>
<div class="info" float="right">
    <span>Good Good Study</span>
    <a href="http://www.bjsxt.cn"></a>
    <strong><!--This is an HTML comment--></strong>
</div>
'''
soup = BeautifulSoup(str, 'lxml')

print(soup.title)
print(soup.div)

print(soup.div.attrs)
print(soup.div.get('class'))
print(soup.div['float'])
print(soup.a['href'])

print(soup.div.string)
print(type(soup.div.string))
print(soup.div.text)

if type(soup.strong.string) == Comment:
    print(soup.strong.string)
    print(soup.strong.prettify())
else:
    print(soup.strong.text)
print("------------------find_all----------------------")
print(soup.find_all('title'))
print(soup.find_all(id='title'))
print(soup.find_all(class_='info'))
print(soup.find_all("div", attrs={'float': 'left'}))

str2 = '''
<title id="title">尚学堂</title>
<div class="info" float="left">Welcome to SXT</div>
<div class="info" float="right">
    <span>Good Good Study</span>
    <a href="http://www.bjsxt.cn"></a>
</div>
'''
soup = BeautifulSoup(str2, 'lxml')
print("--------------------css()---------------------------")
print(soup.select('title'))
print(soup.select('#title'))
print(soup.select('.info'))
print(soup.select('div span'))
print(soup.select('div > span'))
print(soup.select('div')[1].select('a'))
print(soup.select('title')[0].text)
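
The same selectors also work on a page fetched with requests. Here is a rough sketch that replaces the regex extraction from the first example with BeautifulSoup; the div.content > span selector is an assumption based on the pattern used there.

import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

url = "https://www.qiushibaike.com/text/page/1/"
headers = {"User-Agent": UserAgent().random}
response = requests.get(url, headers=headers)

soup = BeautifulSoup(response.text, 'lxml')
# Selector assumed from the regex in the first example: <div class="content"><span>...</span>
for span in soup.select('div.content > span'):
    print(span.get_text(strip=True))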

#!/usr/bin/env python
# -*- coding: utf-8 -*-

“”"
@author: yifan
@file: 正则表达.py
@time: 2021/11/12
@desc:
“”"
import re

str1 = "I Study Python3.6 Everyday"
print("-------------match()-----------------")

# Match the character 'I'

m1 = re.match(r'I', str1)
m2 = re.match(r'\w', str1)
m3 = re.match(r'.', str1)
m4 = re.match(r'\D', str1)
m5 = re.match(r'i', str1, re.I)
m6 = re.match(r'\S', str1)

m7 = re.match(r'Study', str1)  # no match: match() only matches from the start of the string

print(m6.group())
print("-------------search()-----------------")

# Match 'Study'

s1 = re.search(r'Study', str1)
s2 = re.search(r'S\w+', str1)

# Match 'Python3.6'

s3 = re.search(r'P\w+\.\d', str1)
print(s3.group())
print("-------------findall()-----------------")

# Find all occurrences of 'y'

f1 = re.findall(r'y', str1)
print(f1)
print("-------------test()-----------------")
str2 = '<div><a href="http://www.bjsxt.cn">bjsxt尚学堂</a></div>'

# Extract the text inside the <a> tag

t1 = re.findall(r'[\u4e00-\u9fa5]\w+', str2)
t2 = re.findall(r'<a href=.+>(.+)</a>', str2)

# Extract the href attribute

t3 = re.findall(r'href="(.+)"', str2)
print(t3)
print("-------------sub()-----------------")

# Replace the div tags in str2 with span tags

su1 = re.sub(r'<div>(.+)</div>', r'<span>\1</span>', str2)
print(su1)
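
When one pattern is reused many times (as in the paginated scraping sketch above), it can be precompiled with re.compile. A small sketch with a made-up sample string:

import re

# Precompiled version of the content pattern used earlier; compile once, reuse per page.
content_pattern = re.compile(r'<div class="content">\s*<span>\s*(.+)\s*</span>')

sample = '<div class="content"><span>example joke text</span></div>'  # hypothetical input
print(content_pattern.findall(sample))  # ['example joke text']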
