最近抽空写了一段数据抓取代码,用于抓取微博用户发布在奉天承芸超话里的视频信息。代码已运行通过,抓取代码如下:
等我稍闲下来,后续再出一个避免重复抓取网页的优化版本。
def run(self):
    """Crawl the paginated Weibo feed and record video posts of one source.

    Requests at most ``self.TotalPage - 1`` pages. The first page is
    ``self.basic_weibo_url``; every later page appends the ``since_id``
    cursor read from the previous page's ``cardlistInfo``. For each card
    whose ``mblog`` has ``source == self.weibo_str`` and a ``page_info`` of
    type ``"video"``, ``self.item_num`` is incremented and one row is
    appended to ``ws1``. After a page that produced rows, ``wb`` is saved
    to ``media.xlsx``.

    ``ws1`` and ``wb`` are defined outside this method (presumably an
    openpyxl worksheet and workbook -- confirm against the rest of the file).

    Returns:
        ``self.end_flag`` (``True``) when crawling stops early because of a
        non-200 response, a missing ``cardlistInfo`` or an exhausted
        ``since_id`` cursor. ``None`` when the page budget is used up or
        ``self.end_flag`` was already set.
    """
    session = requests.Session()
    # For now only crawl up to self.TotalPage pages (the author uses 200).
    for page_no in range(1, self.TotalPage):
        time.sleep(1)  # throttle: one request per second
        if self.end_flag:
            break

        if page_no == 1:
            self.url = self.basic_weibo_url
        else:
            self.url = self.basic_weibo_url + '&since_id=' + str(self.since_id)

        weibo_response = session.get(self.url, timeout=(3.15, 10))
        # Check the status BEFORE decoding: an error page is often not JSON,
        # and decoding it first would raise instead of ending cleanly.
        if weibo_response.status_code != 200:
            print("end process for reponse.status_code=", weibo_response.status_code)
            self.end_flag = True
            return self.end_flag

        data = weibo_response.json().get('data') or {}

        cardlist_info = data.get('cardlistInfo')
        if not cardlist_info:
            print("end process for there is not cardlistInfo")
            self.end_flag = True
            return self.end_flag
        self.since_id = cardlist_info.get('since_id')

        # Default to an empty list so a page without cards neither raises
        # NameError nor re-processes the previous page's cards.
        cards = data.get('cards') or []

        rows_added = False
        for card in cards:
            m_blog = card.get('mblog')
            if not m_blog:
                continue
            # Keep only posts published inside the target super topic.
            if m_blog.get('source') != self.weibo_str:
                continue
            page_info = m_blog.get('page_info')
            if not page_info or page_info.get('type') != "video":
                continue

            user = m_blog.get('user') or {}
            # Fall back to '' so a video without a cover picture does not
            # inherit the previous row's URL (or raise on the first row).
            page_pic = page_info.get('page_pic') or {}

            self.item_num = self.item_num + 1
            print("append")
            ws1.append([
                str(self.item_num),
                page_info.get('title'),
                user.get('screen_name'),
                page_info.get('content2'),
                "1-视频",
                "3-双人合作舞台-饭拍",
                "2-微博",
                page_info.get('page_url'),
                page_pic.get('url', ''),
                m_blog.get('created_at'),
                page_info.get('play_count'),
                '',
                '',
            ])
            rows_added = True

        # Persist once per page instead of rewriting the workbook per row.
        if rows_added:
            wb.save('media.xlsx')

        # The cursor is checked only after this page's cards are handled, so
        # the final page is not dropped. A missing cursor (None) also ends
        # the crawl instead of requesting '&since_id=None'.
        if self.since_id is None or self.since_id == '':
            print("end process for since_id is empty")
            self.end_flag = True
            return self.end_flag
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)