Spider Base Class -- A Simple Wrapper


Preface
  • The get_proxies method in the code below is only a stub; implement it to fit your own proxy setup (see the sketch right after this note).
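A minimal sketch of how a subclass might fill in get_proxies for the BaseSpider class defined under "Code" below. The proxy-pool endpoint http://proxy.example.com/get and its plain "host:port" response are hypothetical placeholders, not part of the original post:

import requests

class MySpider(BaseSpider):
    def get_proxies(self) -> dict:
        # Ask the (hypothetical) proxy pool for one "host:port" address
        proxy = requests.get("http://proxy.example.com/get", timeout=5).text.strip()
        # requests expects a mapping of scheme -> proxy URL
        return {
            "http": "http://{}".format(proxy),
            "https": "http://{}".format(proxy),
        }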

Code
# -*- coding: utf-8 -*-

import hashlib
import random
import requests
from datetime import datetime


class BaseSpider:
    def __init__(self):
        # One shared Session so cookies and connections are reused across requests
        self.sess = requests.Session()

    def make_rand_name(self, rand_times=4) -> str:
        # Concatenate `rand_times` random integers (1-99) into a random name string
        arr = [random.randint(1, 99) for _ in range(rand_times)]
        _name = "{}" * rand_times
        name = _name.format(*arr)
        return name

    def get_curr_time(self) -> str:
        curr_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return curr_time

    def make_md5(self, s: str) -> str:
        value = hashlib.md5(s.encode()).hexdigest()
        return value

    def make_ua(self) -> str:
        # Compose a random desktop Chrome User-Agent string
        a = random.randint(55, 100)
        c = random.randint(0, 3200)
        d = random.randint(0, 140)
        os_type = [
            '(Windows NT 6.1; WOW64)', '(Windows NT 10.0; WOW64)', '(X11; Linux x86_64)',
            '(Macintosh; Intel Mac OS X 10_12_6)'
        ]
        chrome_version = 'Chrome/{}.0.{}.{}'.format(a, c, d)
        ua = ' '.join(
            ['Mozilla/5.0', random.choice(os_type), 'AppleWebKit/537.36',
             '(KHTML, like Gecko)', chrome_version, 'Safari/537.36']
        )
        return ua

    def make_head(self) -> dict:
        headers = {
            "User-Agent": self.make_ua()
        }
        return headers

    def get_proxies(self) -> dict:
        # Stub: return a requests-style proxies dict; implement per your own proxy source
        return {}

    def get_curr_ip(self) -> str:
        # Query httpbin.org to see the outgoing IP (handy for verifying a proxy)
        url = "https://httpbin.org/ip"
        resp = self.parse(url)
        ip = resp.json()["origin"]
        return ip

    def parse(self, url, timeout=10, retry=2, hope_code=200):
        # GET `url` with up to `retry` extra attempts; return the Response when
        # the status code matches `hope_code`, otherwise None after all attempts
        headers = self.make_head()
        proxies = self.get_proxies()
        for _ in range(retry + 1):
            try:
                resp = self.sess.get(url, headers=headers, proxies=proxies, timeout=timeout)
            except Exception as e:
                print("ERROR  {}".format(e))
            else:
                if resp.status_code == hope_code:
                    return resp
                print("WARNING  {}".format(resp.status_code))
        return None

    def download_html(self, fp: str, url, encoding=None):
        # Fetch a page and write its text to `fp` (UTF-8 by default)
        resp = self.parse(url)
        with open(fp, "w", encoding=encoding or "utf8") as f:
            f.write(resp.text)

    def download_img(self, fp: str, url):
        # Fetch a binary resource (e.g. an image) and write its bytes to `fp`
        resp = self.parse(url)
        with open(fp, "wb") as f:
            f.write(resp.content)

    def save_file(self, fp: str, data: str):
        # Write a text string to `fp`; UTF-8 keeps behaviour consistent across platforms
        with open(fp, "w", encoding="utf8") as f:
            f.write(data)


if __name__ == '__main__':
    bs = BaseSpider()
    print(bs.get_curr_ip())
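
A quick usage sketch (the URL and file paths below are placeholders, not from the original post): fetch a page with download_html, then save an image under a filename derived from the MD5 of its URL.

spider = BaseSpider()
page_url = "https://example.com/"
# Save the page's HTML next to the script
spider.download_html("page.html", page_url)

img_url = "https://example.com/logo.png"
# Name the image file by the MD5 of its URL to avoid collisions
spider.download_img("{}.png".format(spider.make_md5(img_url)), img_url)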


Original article: http://outofmemory.cn/langs/918650.html
