三个python爬虫项目实例代码

三个python爬虫项目实例代码,第1张

三个python爬虫项目实例代码

这篇文章主要介绍了三个python爬虫项目实例代码,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友可以参考下

爬取内涵段子:

#encoding=utf-8
import urllib2

import re


class neihanba():
  def spider(self):
    '''
    爬虫的主调度器
    '''
    isflow=True#判断是否进行下一页
    page=1
    while isflow:
      url="http://www.neihanpa.com/article/list_5_"+str(page)+".html"
      html=self.load(url)
      self.deal(html,page)
      panduan=raw_input("是否继续(y/n)!")
      if panduan=="y":
 isflow=True
 page+=1
      else:
 isflow=False
  def load(self,url):
    '''
    针对url地址进行全部爬去
    :param url: url地址
    :return: 返回爬去的内容
    '''
    header = {
      "User-Agent": " Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
    }
    request = urllib2.Request(url, headers=header)
    response = urllib2.urlopen(request)
    html = response.read()
    return html
  def deal(self,html,page):
    '''
    对之前爬去的内容进行正则匹配,匹配出标题和正文内容
    :param html:之前爬去的内容
    :param page: 正在爬去的页码
    '''
    parrten=re.compile('
  • (.*?)
  • ',re.S) titleList=parrten.findall(html) for title in titleList: parrten1=re.compile('(.*)') ti1=parrten1.findall(title) parrten2=re.compile('(.*?)',re.S) til2=parrten2.findall(title) for t in ti1: tr=t.replace("","").replace("","") self.writeData(tr,page) for t in til2: tr=t.replace("

    ","").replace("

    ","").replace("
    ","").replace("
    ","").replace("&ldquo",""").replace("&rdquo",""") self.writeData(tr,page) def writeData(self,context,page): ''' 将最终爬去的内容写入文件中 :param context: 匹配好的内容 :param page: 当前爬去的页码数 ''' fileName = "di" + str(page) + "yehtml.txt" with open(fileName, "a") as file: file.writelines(context + "n") if __name__ == '__main__': n=neihanba() n.spider()

    爬取智联:

    #encoding=utf-8
    import urllib
    import urllib2
    
    import re
    
    
    class zhiLian():
      def spider(self,position,workPlace):
        '''
        爬虫的主调度器
        :param position: 职位
        :param workPlace: 工作地点
        '''
        url="http://sou.zhaopin.com/jobs/searchresult.ashx?"
        url+=urllib.urlencode({"jl":workPlace})
        url+="&"
        url+=urllib.urlencode({"kw":position})
        isflow=True#是否进行下一页的爬去
        page=1
        while isflow:
          url+="&"+str(page)
          html=self.load(url)
          self.deal1(html,page)
          panduan = raw_input("是否继续爬虫下一页(y/n)!")
          if panduan == "y":
     isflow = True
     page += 1
          else:
     isflow = False
      def load(self,url):
        '''
        针对url地址进行全部爬去
        :param url: url地址
        :return: 返回爬去的内容
        '''
        header = {
          "User-Agent": " Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
        }
        request = urllib2.Request(url, headers=header)
        response = urllib2.urlopen(request)
        html = response.read()
        return html
      def deal1(self,html,page):
        '''
    
        对之前爬去的内容进行正则匹配,匹配职位所对应的链接
        :param html:之前爬去的内容
        :param page: 正在爬去的页码
        '''
        parrten=re.compile('.*?',re.S)
        til=parrten.findall(html)#爬去链接
        for t in til:
          self.deal2(t,page)
      def deal2(self,t,page):
        '''
        进行二次爬虫,然后在新的页面中对公司、薪资、工作经验进行匹配
        :param t: url地址
        :param page: 当前匹配的页数
        '''
        html=self.load(t)#返回二次爬虫的内容
        parrten1=re.compile('(.*?)s+.*?',re.S)
        parrten2=re.compile('
  • 职位月薪:(.*?) .*?
  • ',re.S) parrent3=re.compile('
  • 工作经验:(.*?)
  • ',re.S) til1=parrten1.findall(html) til2=parrten2.findall(html) til3=parrent3.findall(html) str="" for t in til1: t=t.replace('',"") str+=t str+="t" for t in til2: str+=t str += "t" for t in til3: str+=t self.writeData(str,page) def writeData(self,context,page): ''' 将最终爬去的内容写入文件中 :param context: 匹配好的内容 :param page: 当前爬去的页码数 ''' fileName = "di" + str(page) + "yehtml.txt" with open(fileName, "a") as file: file.writelines(context + "n") if __name__ == '__main__': position=raw_input("请输入职位:") workPlace=raw_input("请输入工作地点:") z=zhiLian() z.spider(position,workPlace)

    爬取贴吧:

    #encoding=utf-8
    import urllib
    import urllib2
    
    import re
    
    
    class teiba():
      def spider(self,name,startPage,endPage):
        url="http://tieba.baidu.com/f?ie=utf-8&"
        url+=urllib.urlencode({"kw":name})
        for page in range(startPage,endPage+1):
          pn=50*(page-1)
          urlFull=url+"&"+urllib.urlencode({"pn":pn})
          html=self.loadPage(url)
          self.dealPage(html,page)
    
      def loadPage(self,url):
        header={
          "User-Agent":" Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
        }
        request=urllib2.Request(url,headers=header)
        response=urllib2.urlopen(request)
        html=response.read()
        return html
      def dealPage(self,html,page):
        partten=re.compile(r'(.*?)',re.S)
        titleList=partten.findall(html)
        rstr=r'#(.*?)#'
        for title in titleList:
          title=re.sub(rstr,"",title)
          self.writePage(title,page)
      def writePage(self,context,page):
        fileName="di"+str(page)+"yehtml.txt"
        with open(fileName,"a") as file:
          file.writelines(context+"n")
    if __name__ == '__main__':
      name=raw_input("请输入贴吧名:")
      startPage=raw_input("请输入起始页:")
      endPage=raw_input("请输入终止页:")
      t=teiba()
      t.spider(name,int(startPage),int(endPage))

    以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持考高分网。

    欢迎分享,转载请注明来源:内存溢出

    原文地址: http://outofmemory.cn/zaji/3244085.html

    (0)
    打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
    上一篇 2022-10-04
    下一篇 2022-10-04

    发表评论

    登录后才能评论

    评论列表(0条)

    保存