"""Crawl movie data from https://spa1.scrape.center/ via its Ajax JSON API.

The site paginates through limit/offset query parameters:
    /api/movie/?limit=10&offset=0    -> page 1
    /api/movie/?limit=10&offset=10   -> page 2
    /api/movie/?limit=10&offset=20   -> page 3
``limit`` stays 10 (one page of results); ``offset`` grows by 10 per page.
Each movie's detail JSON is saved to ``RESULT_DIR/<name>.json``.
"""
import json
import logging
import os

import requests

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

INDEX_URL = "https://spa1.scrape.center/api/movie/?limit=10&offset={offset}"
DETAIL_URL = "https://spa1.scrape.center/api/movie/{id}"
TOTAL_PAGE = 2  # number of index pages to crawl
RESULT_DIR = "Ajax_result"  # output directory (same on-disk value as before)

# exist_ok avoids the check-then-create race of exists()+makedirs()
os.makedirs(RESULT_DIR, exist_ok=True)


def scrape_api(url):
    """GET *url* and return its decoded JSON payload, or None on any failure.

    Failures (non-200 status, network error) are logged, never raised, so
    callers must be prepared for a ``None`` return.
    """
    logging.info("scraping %s...", url)
    try:
        # timeout so a stalled server cannot hang the crawl forever
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.json()
        logging.error("get invalid status code %s while scrape %s",
                      response.status_code, url)
    except requests.RequestException:
        logging.error("error occurred while scrape %s", url, exc_info=True)
    return None  # explicit failure marker


def scrape_index(page):
    """Fetch one index (list) page as JSON; *page* is 1-based."""
    url = INDEX_URL.format(offset=10 * (page - 1))
    return scrape_api(url)


def scrape_detail(movie_id):
    """Fetch the detail JSON for a single movie id."""
    url = DETAIL_URL.format(id=movie_id)
    return scrape_api(url)


def save_data(data):
    """Write one movie's JSON dict to ``RESULT_DIR/<name>.json``."""
    name = data.get("name")
    data_path = f"{RESULT_DIR}/{name}.json"
    # with-statement closes the file handle the original version leaked
    with open(data_path, mode='w', encoding='utf-8') as fp:
        json.dump(data, fp, ensure_ascii=False, indent=2)


def main():
    """Crawl TOTAL_PAGE index pages and persist every movie's detail JSON."""
    for page in range(1, TOTAL_PAGE + 1):
        index_data = scrape_index(page)
        if not index_data:
            # request failed; skip the page instead of crashing on None
            logging.warning("no index data for page %s, skipped", page)
            continue
        for item in index_data.get("results") or []:
            movie_id = item.get("id")
            detail_data = scrape_detail(movie_id)
            if not detail_data:
                logging.warning("no detail data for id %s, skipped", movie_id)
                continue
            save_data(detail_data)
            logging.info("detail data saved successfully")


if __name__ == '__main__':
    main()
# --- Scraped-page footer below: not part of the script; kept as comments so the file parses ---
# 欢迎分享,转载请注明来源:内存溢出  (blog footer: "Feel free to share; credit the source: neicunyichu")
# 评论列表(0条)  (blog footer: "Comments (0)")