Selenium快速使用代理IP
2020/9/3 8:03:39
本文主要是介绍Selenium快速使用代理IP,对大家解决编程问题具有一定的参考价值,需要的程序猿们随着小编来一起学习吧!
由于部分网站的反爬措施较多,采集时往往需要针对网站的js等做多种策略分析,导致研发工作量急剧增加。为了快速启动项目并采集数据,可以使用Selenium自动化测试工具模拟用户终端,并结合爬虫代理IP,方便快捷地采集数据。
1. Selenium爬虫采集的优点:
(1)研发投入少,代码易维护
(2)回归测试(regression)方便
(3)可扩展性好
(4)采集效果好
2、Selenium环境搭建
安装
pip3 install -r requirements.txt
安装chrome并下载对应版本的chromedriver
下载chrome https://www.google.com/chrome/
下载对应版本 driver https://chromedriver.chromium.org/downloads
基础配置
接口基本配置
# Redis数据库地址
REDIS_HOST = 'localhost'
# Redis端口
REDIS_PORT = 6379
# Redis密码,如无填None
REDIS_PASSWORD = None
# 产生器使用的浏览器
BROWSER_TYPE = 'Chrome'
# 产生器类,如扩展其他站点,请在此配置
GENERATOR_MAP = {
    'weibo': 'WeiboCookiesGenerator'
}
# 测试类,如扩展其他站点,请在此配置
TESTER_MAP = {
    'weibo': 'WeiboValidTester'
}
# 检测器检测接口
TEST_URL_MAP = {
    'weibo': 'https://m.weibo.cn/api/container/getIndex?uid=1804544030&type=uid&page=1&containerid=1076031804544030'
}
# 产生器和验证器循环周期
CYCLE = 120
# API地址和端口
API_HOST = '0.0.0.0'
API_PORT = 5000
进程开关
在config.py修改
# 产生器开关,模拟登录添加Cookies
GENERATOR_PROCESS = True
# 验证器开关,循环检测数据库中Cookies是否可用,不可用删除
VALID_PROCESS = False
# API接口服务
API_PROCESS = False
导入账号
python3 importer.py
请输入账号密码组, 输入exit退出读入
180000000----16yun
账号 180000000 密码 16yun
录入成功
exit
运行
请先导入一部分账号之后再运行,运行命令:
python3 run.py
运行效果
三个进程全部开启:
API接口开始运行
 * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
Cookies生成进程开始运行
Cookies检测进程开始运行
正在生成Cookies 账号 180000000 密码 16yun
正在测试Cookies 用户名 180000000
Cookies有效 180000000
3.下面提供Selenium使用代理IP的demo:
import json
import os
import random
import time
import zipfile

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def _load_user_agents(path):
    """Read user-agent strings from *path*, one per line.

    Surrounding whitespace (including the trailing newline) is stripped and
    blank lines are skipped. A missing file yields an empty list so that
    merely importing this module does not fail; in that case Chrome keeps
    its default user agent.
    """
    try:
        with open(path, encoding='utf-8') as handle:
            return [line.strip() for line in handle if line.strip()]
    except FileNotFoundError:
        return []


class GenCookies(object):
    """Log in to a site with Chrome behind an authenticated proxy and
    return the resulting cookies.

    The proxy credentials are delivered to Chrome through a small extension
    that is generated on the fly (see ``get_chromedriver``).
    """

    # Pool of user-agent strings; one is picked at random per browser.
    USER_AGENT = _load_user_agents('useragents.txt')
    # Proxy server (vendor site: www.16yun.cn)
    PROXY_HOST = 't.16yun.cn'  # proxy host name or IP
    PROXY_PORT = 31111  # port
    PROXY_USER = 'USERNAME'  # username
    PROXY_PASS = 'PASSWORD'  # password

    # Manifest of the generated Chrome extension.
    _MANIFEST_JSON = """
    {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "Chrome Proxy",
        "permissions": [
            "proxy",
            "tabs",
            "unlimitedStorage",
            "storage",
            "<all_urls>",
            "webRequest",
            "webRequestBlocking"
        ],
        "background": {
            "scripts": ["background.js"]
        },
        "minimum_chrome_version":"22.0.0"
    }
    """

    # Background script template: configures a fixed HTTP proxy and answers
    # the proxy's authentication challenge. String placeholders are filled
    # with JSON-encoded values, so they carry their own quotes.
    _BACKGROUND_JS = """
    var config = {
        mode: "fixed_servers",
        rules: {
            singleProxy: {
                scheme: "http",
                host: %(host)s,
                port: %(port)d
            },
            bypassList: ["localhost"]
        }
    };
    chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});
    function callbackFn(details) {
        return {
            authCredentials: {
                username: %(user)s,
                password: %(password)s
            }
        };
    }
    chrome.webRequest.onAuthRequired.addListener(
        callbackFn,
        {urls: ["<all_urls>"]},
        ['blocking']
    );
    """

    @classmethod
    def _build_background_js(cls, host, port, user, password):
        """Return the extension's background script for the given proxy.

        String values are emitted with ``json.dumps`` so that quotes or
        backslashes in a host name or credential cannot break out of the
        JavaScript string literal.
        """
        return cls._BACKGROUND_JS % {
            'host': json.dumps(str(host)),
            'port': int(port),
            'user': json.dumps(str(user)),
            'password': json.dumps(str(password)),
        }

    @classmethod
    def _pick_user_agent(cls):
        """Return one random entry of ``USER_AGENT``, or None if it is empty."""
        if not cls.USER_AGENT:
            return None
        return random.choice(cls.USER_AGENT).strip()

    @classmethod
    def get_chromedriver(cls, use_proxy=False, user_agent=None):
        """Create a Chrome WebDriver.

        :param use_proxy: when true, generate ``proxy_auth_plugin.zip`` in
            the current working directory and load it as an extension so
            that Chrome uses the ``PROXY_*`` settings of this class.
        :param user_agent: a single user-agent string to pass to Chrome;
            None or empty keeps Chrome's default.
        :return: the started WebDriver instance.
        """
        path = os.path.dirname(os.path.abspath(__file__))
        chrome_options = webdriver.ChromeOptions()
        # Optional: drop the "controlled by automated software" switch.
        # chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        if use_proxy:
            pluginfile = 'proxy_auth_plugin.zip'
            background_js = cls._build_background_js(
                cls.PROXY_HOST, cls.PROXY_PORT, cls.PROXY_USER, cls.PROXY_PASS)
            with zipfile.ZipFile(pluginfile, 'w') as zp:
                zp.writestr("manifest.json", cls._MANIFEST_JSON)
                zp.writestr("background.js", background_js)
            chrome_options.add_extension(pluginfile)
        if user_agent:
            chrome_options.add_argument('--user-agent=%s' % user_agent)
        # chromedriver is expected next to this file. ``options=`` replaces
        # the deprecated ``chrome_options=`` keyword.
        driver = webdriver.Chrome(
            os.path.join(path, 'chromedriver'),
            options=chrome_options)
        # Optional: hide navigator.webdriver from the page.
        # script = '''
        # Object.defineProperty(navigator, 'webdriver', {
        #     get: () => undefined
        # })
        # '''
        # driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": script})
        return driver

    def __init__(self, username, password):
        """Start a proxied browser for the given account.

        :param username: login name typed into the login form.
        :param password: password typed into the login form.
        """
        # Login page of the example site
        self.url = 'https://passport.example.cn/signin/login?entry=example&r=https://m.example.cn/'
        # Pass ONE user agent; passing the whole list would put the list's
        # repr into the --user-agent switch.
        self.browser = self.get_chromedriver(
            use_proxy=True, user_agent=self._pick_user_agent())
        self.wait = WebDriverWait(self.browser, 20)
        self.username = username
        self.password = password

    def close(self):
        """Quit the browser and release the chromedriver process."""
        self.browser.quit()

    def open(self):
        """
        Open the login page, fill in user name and password, and submit.
        :return: None
        """
        self.browser.delete_all_cookies()
        self.browser.get(self.url)
        username = self.wait.until(EC.presence_of_element_located((By.ID, 'loginName')))
        password = self.wait.until(EC.presence_of_element_located((By.ID, 'loginPassword')))
        submit = self.wait.until(EC.element_to_be_clickable((By.ID, 'loginAction')))
        username.send_keys(self.username)
        password.send_keys(self.password)
        time.sleep(1)
        submit.click()

    def password_error(self):
        """
        Check whether the page reports a wrong user name or password.
        :return: a truthy value if the error text shows up in the
            ``errorMsg`` element within 5 seconds, otherwise False
        """
        try:
            return WebDriverWait(self.browser, 5).until(
                EC.text_to_be_present_in_element((By.ID, 'errorMsg'), '用户名或密码错误'))
        except TimeoutException:
            return False

    def get_cookies(self):
        """
        Fetch the cookies of the current browser session.
        :return: whatever ``self.browser.get_cookies()`` returns
        """
        return self.browser.get_cookies()

    def main(self):
        """
        Entry point: log in and report the outcome.
        :return: dict with ``status`` 2 and an error message when the
            credentials are rejected, otherwise ``status`` 1 and the cookies
        """
        self.open()
        if self.password_error():
            return {
                'status': 2,
                'content': '用户名或密码错误'
            }
        cookies = self.get_cookies()
        return {
            'status': 1,
            'content': cookies
        }
# Demo entry point: log in with a sample account and print the result dict.
if __name__ == '__main__':
    generator = GenCookies(
        username='180000000',
        password='16yun',
    )
    try:
        result = generator.main()
    finally:
        # Always shut Chrome and chromedriver down, even if the login flow
        # raised, so no orphaned browser process is left behind.
        generator.browser.quit()
    print(result)
这篇关于Selenium快速使用代理IP的文章就介绍到这儿,希望我们推荐的文章对大家有所帮助,也希望大家多多支持为之网!
- 2024-11-24Java中定时任务实现方式及源码剖析
- 2024-11-24Java中定时任务实现方式及源码剖析
- 2024-11-24鸿蒙原生开发手记:03-元服务开发全流程(开发元服务,只需要看这一篇文章)
- 2024-11-24细说敏捷:敏捷四会之每日站会
- 2024-11-23Springboot应用的多环境打包入门
- 2024-11-23Springboot应用的生产发布入门教程
- 2024-11-23Python编程入门指南
- 2024-11-23Java创业入门:从零开始的编程之旅
- 2024-11-23Java创业入门:新手必读的Java编程与创业指南
- 2024-11-23Java对接阿里云智能语音服务入门详解