python下 selenium与chrome结合进行网页爬取,怎么设置代理IP

python下 selenium与chrome结合进行网页爬取,怎么设置代理IP,第1张

#coding:utf-8

import sys,re,random,time,os

import socket

from socket import error as socket_error

import threading

import urllib2,cookielib

from bs4 import BeautifulSoup

from selenium import webdriver

from selenium.webdriver.common.proxy import *

from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

# Current local date formatted as YYYYMMDD, evaluated once at import time.
# NOTE(review): the name suggests this is used as the proxy-list file path;
# no use of it is visible in this section -- confirm against callers.
proxyFilePath = time.strftime("%Y%m%d")

def testSocket(ip, port):

'''

socket连接测试,用来检测proxy ip,port 是否可以正常连接

'''

print '正在测试socket连接...'

sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

try:

sock.settimeout(10)

sock.connect((ip, int(port)))

#sock.send('meta')

sock.close()

print ip+':'+port+'--status:ok'

return 1

except socket_error as serr: # connection error

sock.close()

print ip+':'+port+'--status:error--Connection refused.'

return 0

def getDriver(httpProxy='', type='Firefox'):
    """Start a Selenium WebDriver configured to use ``httpProxy``.

    Args:
        httpProxy: Proxy address handed to the browser. For Firefox the
            same value is used for the HTTP, FTP and SSL proxies.
            NOTE(review): presumably in ``host:port`` form -- confirm
            against callers.
        type: Which browser to start: ``'Firefox'`` or ``'PhantomJS'``;
            any other value starts Chrome. (The name shadows the builtin
            ``type`` but is kept so keyword callers continue to work.)

    Returns:
        The newly created WebDriver instance.
    """
    if type == 'Firefox':
        proxy = Proxy({
            'proxyType': ProxyType.MANUAL,
            'httpProxy': httpProxy,
            'ftpProxy': httpProxy,
            'sslProxy': httpProxy,
            'noProxy': '',  # set this value as desired
        })

        firefox_profile = FirefoxProfile()
        firefox_profile.add_extension(
            "firefox_extensions/webdriver_element_locator-1.rev312-fx.xpi")
        firefox_profile.set_preference("browser.download.folderList", 2)
        firefox_profile.set_preference("webdriver.load.strategy", "unstable")

        # To use a specific Firefox install, additionally pass
        # firefox_binary=FirefoxBinary('/path/to/firefox').
        driver = webdriver.Firefox(firefox_profile=firefox_profile,
                                   proxy=proxy)
    elif type == 'PhantomJS':
        service_args = [
            '--proxy=' + httpProxy,
            '--proxy-type=http',
        ]

        # Work on a copy: DesiredCapabilities.PHANTOMJS is a shared
        # class-level dict, and mutating it would leak these headers into
        # every PhantomJS driver created afterwards.
        capabilities = dict(webdriver.DesiredCapabilities.PHANTOMJS)
        # The ';' separators before the q-values (and inside the User-Agent
        # platform token) were missing, producing malformed header values.
        capabilities['phantomjs.page.customHeaders.Accept'] = (
            'text/html,application/xhtml+xml,application/xml;q=0.9,'
            'image/webp,*/*;q=0.8')
        capabilities['phantomjs.page.customHeaders.User-Agent'] = (
            'Mozilla/5.0 (X11; Windows x86_64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36')

        driver = webdriver.PhantomJS(executable_path='windows/phantomjs.exe',
                                     desired_capabilities=capabilities,
                                     service_args=service_args)
    else:  # Chrome
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--proxy-server=%s' % httpProxy)
        driver = webdriver.Chrome(executable_path='windows/chromedriver.exe',
                                  chrome_options=chrome_options)

    return driver

这个代理是在IE里设置的。 不是用chrome,方向错了。 你搜索一下《使用Python给IE设置代理》。 可以找到答案。下面一些代码,看看有没用。

def changeIEProxy(keyName, keyValue):
    """Write one string value under the current user's Internet Settings key.

    Opens ``HKEY_CURRENT_USER\\Software\\Microsoft\\Windows\\CurrentVersion\\
    Internet Settings`` with full access and stores ``keyValue`` as a
    ``REG_SZ`` value named ``keyName``.

    Requires ``win32api`` and ``win32con`` (pywin32) to be available in
    module scope; they are not imported in the visible part of this file.

    Args:
        keyName: Name of the registry value to set.
        keyValue: String data to store under ``keyName``.
    """
    # Raw string: the backslashes are literal path separators, not escapes.
    pathInReg = r'Software\Microsoft\Windows\CurrentVersion\Internet Settings'
    key = win32api.RegOpenKey(win32con.HKEY_CURRENT_USER, pathInReg, 0,
                              win32con.KEY_ALL_ACCESS)
    try:
        win32api.RegSetValueEx(key, keyName, 0, win32con.REG_SZ, keyValue)
    finally:
        # Release the key handle even if the write fails.
        win32api.RegCloseKey(key)

据说IE浏览器的代理内容在注册表中

『HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings』

这里存着


欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/bake/7929697.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2023-04-11
下一篇 2023-04-11

发表评论

登录后才能评论

评论列表(0条)

保存