XPath (lxml) version:
import requests
import time
import csv
from requests import RequestException
from lxml import etree


def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
            # session cookie copied from a logged-in browser session
            'cookie': 'bid=bq6bl4kmTuw; douban-fav-remind=1; __gads=ID=591697b17f1fe23a-220f8b8865cf0092:T=1639058234:RT=1639058234:S=ALNI_MZO_48BdnjRSkE5opDvE-zAaE9tAA; ll="108288"; __utmz=223695111.1639215564.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _vwo_uuid_v2=D23ED6BBB550E9DB6F96FC7952ABEC279|a138b60c8a7b45c62db689ccc572217f; push_noty_num=0; push_doumail_num=0; ap_v=0,6.0; __utmz=30149280.1639231925.6.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.25152; dbcl2="251525456:JrZGQt48U80"; ck=heCZ; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1639235024%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1639235088; __utmc=30149280; __utma=30149280.635194538.1639058235.1639231925.1639235088.7; __utma=223695111.1210278848.1639058235.1639226759.1639235088.6; __utmb=223695111.0.10.1639235088; __utmc=223695111; _pk_id.100001.4cf6=79142103d75fe9fc.1639058235.6.1639235410.1639226759.'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return '请求失败'
    except RequestException:
        return '爬取错误'


def parse():
    moviesList = []
    for i in range(10):  # 10 pages, 25 movies per page
        url = 'https://movie.douban.com/top250?start=' + str(25 * i) + '&filter='
        response = get_one_page(url)
        html = etree.HTML(response)
        li = html.xpath('//ol[@class="grid_view"]/li')
        for getresult in li:
            index = getresult.xpath('./div/div/em/text()')[0]  # movie rank
            name = getresult.xpath('./div/div[2]/div[1]/a[1]/span[1]/text()')[0]  # movie title
            # director and lead actors share one line; strip() removes newlines,
            # spaces and non-breaking spaces ('\xa0') around it
            director_actor = getresult.xpath('./div/div[2]/div[2]/p[1]/text()')[0].strip()
            director = director_actor[:director_actor.find('主演')].strip()  # director
            actor = director_actor[director_actor.find('主演'):].strip()  # lead actors
            movies_release_date = getresult.xpath('./div/div[2]/div[2]/p/text()')[1].strip()[0:4]  # release year
            movies_score = getresult.xpath('./div/div[2]/div[2]/div/span[2]/text()')[0]  # rating
            movies_score_num = getresult.xpath('./div/div[2]/div[2]/div/span[4]/text()')[0]  # number of ratings
            intro = getresult.xpath('./div/div[2]/div[2]/p[2]/span/text()')  # one-line synopsis, missing for a few titles
            movies_introduce = intro[0] if intro else ''
            moviesList.append([index, name, director, actor, movies_release_date,
                               movies_score, movies_score_num, movies_introduce])
        time.sleep(2)  # be polite between page requests
    return moviesList


def write_to_file(moviesList):
    with open('xpath_result.csv', 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        fieldnames = ['电影排名', '电影名称', '导演', '主演', '发布日期', '评分', '评分人数', '简介']
        writer.writerow(fieldnames)
        for i in moviesList:
            writer.writerow(i)


if __name__ == '__main__':
    moviesList = parse()
    write_to_file(moviesList)
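To sanity-check an XPath expression before crawling all ten pages, it can help to run it against a tiny fragment first. A minimal sketch follows; the HTML string is a hand-written stand-in for one entry of Douban's list markup (an assumption for illustration, not a captured response):

from lxml import etree

# Hand-written sample fragment, assumed to mirror Douban's list structure.
sample = '''
<ol class="grid_view">
  <li><div class="item">
    <div class="pic"><em>1</em></div>
    <div class="info"><div class="hd"><a href="#"><span class="title">肖申克的救赎</span></a></div></div>
  </div></li>
</ol>
'''

html = etree.HTML(sample)
for item in html.xpath('//ol[@class="grid_view"]/li'):
    rank = item.xpath('.//em/text()')[0]                     # -> '1'
    title = item.xpath('.//span[@class="title"]/text()')[0]  # -> '肖申克的救赎'
    print(rank, title)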
Beautiful Soup version:
import requests
import time
import csv
from requests.exceptions import RequestException
from bs4 import BeautifulSoup


def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
            # session cookie copied from a logged-in browser session
            'cookie': 'bid=bq6bl4kmTuw; douban-fav-remind=1; __gads=ID=591697b17f1fe23a-220f8b8865cf0092:T=1639058234:RT=1639058234:S=ALNI_MZO_48BdnjRSkE5opDvE-zAaE9tAA; ll="108288"; __utmz=223695111.1639215564.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _vwo_uuid_v2=D23ED6BBB550E9DB6F96FC7952ABEC279|a138b60c8a7b45c62db689ccc572217f; push_noty_num=0; push_doumail_num=0; ap_v=0,6.0; __utmz=30149280.1639231925.6.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.25152; dbcl2="251525456:JrZGQt48U80"; ck=heCZ; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1639235024%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1639235088; __utmc=30149280; __utma=30149280.635194538.1639058235.1639231925.1639235088.7; __utma=223695111.1210278848.1639058235.1639226759.1639235088.6; __utmb=223695111.0.10.1639235088; __utmc=223695111; _pk_id.100001.4cf6=79142103d75fe9fc.1639058235.6.1639235410.1639226759.'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return '请求失败'
    except RequestException:
        return '爬取错误'


def parse():
    moviesList = []
    for i in range(10):  # 10 pages, 25 movies per page
        url = 'https://movie.douban.com/top250?start=' + str(25 * i) + '&filter='
        response = get_one_page(url)
        soup = BeautifulSoup(response, 'lxml')
        li = soup.select('ol li')
        for getresult in li:
            index = getresult.find('div', class_='pic').find('em').get_text()  # movie rank
            name = getresult.find('span', class_='title').get_text()  # movie title
            director_actor = getresult.find(class_='bd').p.get_text().strip().split('\n')
            # three non-breaking spaces separate the director from the actors
            actor_infos1 = director_actor[0].split('\xa0\xa0\xa0')
            movie_director = actor_infos1[0]  # director
            movie_role = actor_infos1[1] if len(actor_infos1) > 1 else ''  # lead actors (missing for some titles)
            movies_release_date = director_actor[1].strip().split('\xa0/\xa0')[0]  # release year
            movies_score = getresult.find(class_='rating_num').get_text()  # rating
            movies_score_num = getresult.find(class_='star').get_text().strip().split('\n')[2].strip()  # number of ratings
            movies_introduces = getresult.find(class_='inq')  # one-line synopsis, missing for a few titles
            movies_introduce = movies_introduces.get_text() if movies_introduces else ''
            moviesList.append([index, name, movie_director, movie_role, movies_release_date,
                               movies_score, movies_score_num, movies_introduce])
        time.sleep(2)  # be polite between page requests
    return moviesList


def write_to_file(moviesList):
    with open('bs4_result.csv', 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        fieldnames = ['电影排名', '电影名称', '导演', '主演', '发布日期', '评分', '评分人数', '简介']
        writer.writerow(fieldnames)
        for i in moviesList:
            writer.writerow(i)


if __name__ == '__main__':
    moviesList = parse()
    write_to_file(moviesList)
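The '\xa0' characters in the split calls are non-breaking spaces: Douban separates the director and actor fields with a run of three of them, which is why splitting on '\xa0\xa0\xa0' cleanly divides the line. A minimal sketch on a hand-written sample line (assumed to match the page's format):

# Hand-written example of the "director / lead actors" line, for illustration only.
line = '导演: 弗兰克·德拉邦特 Frank Darabont\xa0\xa0\xa0主演: 蒂姆·罗宾斯 Tim Robbins'

parts = line.split('\xa0\xa0\xa0')           # three non-breaking spaces
director = parts[0]                          # '导演: 弗兰克·德拉邦特 Frank Darabont'
actors = parts[1] if len(parts) > 1 else ''  # guard: a few entries have no 主演 field
print(director)
print(actors)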
pyquery version:
import requests
import time
import csv
from requests import RequestException
from pyquery import PyQuery as pq


def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
            # session cookie copied from a logged-in browser session
            'cookie': 'bid=bq6bl4kmTuw; douban-fav-remind=1; __gads=ID=591697b17f1fe23a-220f8b8865cf0092:T=1639058234:RT=1639058234:S=ALNI_MZO_48BdnjRSkE5opDvE-zAaE9tAA; ll="108288"; __utmz=223695111.1639215564.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _vwo_uuid_v2=D23ED6BBB550E9DB6F96FC7952ABEC279|a138b60c8a7b45c62db689ccc572217f; push_noty_num=0; push_doumail_num=0; ap_v=0,6.0; __utmz=30149280.1639231925.6.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.25152; dbcl2="251525456:JrZGQt48U80"; ck=heCZ; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1639235024%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1639235088; __utmc=30149280; __utma=30149280.635194538.1639058235.1639231925.1639235088.7; __utma=223695111.1210278848.1639058235.1639226759.1639235088.6; __utmb=223695111.0.10.1639235088; __utmc=223695111; _pk_id.100001.4cf6=79142103d75fe9fc.1639058235.6.1639235410.1639226759.'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return '请求失败'
    except RequestException:
        return '爬取错误'


def parse():
    moviesList = []
    for i in range(10):  # 10 pages, 25 movies per page
        url = 'https://movie.douban.com/top250?start=' + str(25 * i) + '&filter='
        response = get_one_page(url)
        html = pq(response)
        li = html('div.item').items()
        for getresult in li:
            index = getresult.find('.pic em').text()  # movie rank
            name = getresult.find('.hd span.title').text().split('/')[0].strip()  # movie title (Chinese title only)
            # first <p> holds "director ... lead actors" on its first line
            director_actor = getresult.find('div.bd p:eq(0)').text().split('\n')[0]
            director = director_actor[:director_actor.find('主演')].strip()  # director
            actor = director_actor[director_actor.find('主演'):].strip()  # lead actors
            movies_release_date = getresult.find('div.bd p:eq(0)').text().split('\n')[1].strip()[0:4]  # release year
            movies_score = getresult.find('span.rating_num').text()  # rating
            movies_score_num = getresult.find('.star span').text().strip().split()[1]  # number of ratings
            movies_introduce = getresult.find('span.inq').text()  # one-line synopsis ('' when missing)
            moviesList.append([index, name, director, actor, movies_release_date,
                               movies_score, movies_score_num, movies_introduce])
        time.sleep(2)  # be polite between page requests
    return moviesList


def write_to_file(moviesList):
    with open('pyquery_result.csv', 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        fieldnames = ['电影排名', '电影名称', '导演', '主演', '发布日期', '评分', '评分人数', '简介']
        writer.writerow(fieldnames)
        for i in moviesList:
            writer.writerow(i)


if __name__ == '__main__':
    moviesList = parse()
    write_to_file(moviesList)
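pyquery supports jQuery-style pseudo-classes such as :eq(n), which the code above relies on to pick the first <p> inside div.bd. A minimal sketch on a hand-written fragment (assumed to mirror one list item, for illustration only):

from pyquery import PyQuery as pq

# Hand-written sample fragment, assumed to mirror one Douban list item.
sample = '''
<div class="item">
  <div class="bd">
    <p>导演: 宫崎骏 Hayao Miyazaki</p>
    <p class="quote"><span class="inq">永远的龙猫。</span></p>
  </div>
</div>
'''

doc = pq(sample)
print(doc('div.bd p:eq(0)').text())   # :eq(0) selects only the first <p>
print(doc('div.bd span.inq').text())  # the one-line synopsis, '' if absent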
Partial screenshot of the results (image not included here).
The full result CSV files can be downloaded from my resources on CSDN.