1
我正在使用多進程庫:`from multiprocessing import Pool`。
Python:如何有效地運行多個 PhantomJS 實例?
雖然我使用 `requests`,但有些數據是在彈出窗口中加載的,我想用 Selenium 來獲取。
在不引起內存泄漏的前提下,使用 PhantomJS 的最佳方法是什麼?
我正在使用多進程庫:`from multiprocessing import Pool`。
Python:如何有效地運行多個 PhantomJS 實例?
雖然我使用 `requests`,但有些數據是在彈出窗口中加載的,我想用 Selenium 來獲取。
在不引起內存泄漏的前提下,使用 PhantomJS 的最佳方法是什麼?
基本思路大致可以寫成這樣:
from __future__ import unicode_literals
import logging
from werkzeug.routing import Map
from werkzeug.exceptions import HTTPException
from werkzeug.wrappers import Request
class WebApp(object):
    """Minimal WSGI application that routes requests to ``endpoint_*`` methods.

    Subclasses are expected to override ``url_map`` with their own rules and
    to define one ``endpoint_<name>`` method per rule endpoint; the method is
    called as ``method(adapter, request, **values)``.
    """

    # Routing table consulted by dispatch_request(); empty in the base class.
    url_map = Map([])

    def __init__(self, **kw):
        """Create the application; keyword arguments are accepted but unused."""
        self.log = logging.getLogger(__name__)

    def __call__(self, environ, start_response):
        """WSGI entry point; delegates to :meth:`wsgi_app`."""
        return self.wsgi_app(environ, start_response)

    def wsgi_app(self, environ, start_response):
        """Wrap the WSGI environ in a Request, dispatch it, and run the result.

        The object returned by :meth:`dispatch_request` is itself called as a
        WSGI application with ``(environ, start_response)``.
        """
        request = Request(environ)
        response = self.dispatch_request(request)
        return response(environ, start_response)

    def dispatch_request(self, request):
        """Match the request against ``url_map`` and call the endpoint method.

        Returns whatever the ``endpoint_<name>`` method returns. If matching
        or the endpoint itself raises an ``HTTPException``, that exception is
        returned (not raised) so the caller can use it as the response.
        """
        adapter = self.url_map.bind_to_environ(request.environ)
        try:
            endpoint, values = adapter.match()
            method = getattr(self, 'endpoint_{}'.format(endpoint))
            return method(adapter, request, **values)
        # ``except X as e`` is valid on Python 2.6+ and 3; the old
        # ``except X, e`` form is a SyntaxError on Python 3.
        except HTTPException as e:
            return e
import multiprocessing
from subprocess import Popen, PIPE
from time import sleep

from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver import Firefox, FirefoxProfile
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.proxy import Proxy, ProxyType
from werkzeug.routing import Rule
from werkzeug.wrappers import Response
# Create and start a pyvirtualdisplay Display at import time, configured as
# non-visible with an 800x600 size.
# NOTE(review): presumably this gives the Firefox instances a display to
# render into on a headless server -- confirm. Nothing in this file stops it.
display = Display(visible=0, size=(800, 600))
display.start()
def get_proxy_obj(proxy='123.456.789.012'):
    """Build Firefox desired capabilities that route traffic through a proxy.

    Despite the name, the return value is the capabilities dict, not the
    ``Proxy`` object itself.

    Args:
        proxy: Address used for the HTTP, FTP and SSL proxy settings. The
            default is the placeholder from the original sketch and is not a
            valid IP address; pass a real proxy address in practice.

    Returns:
        A new capabilities dict with ``acceptSslCerts`` enabled and the
        manual proxy settings added.
    """
    proxyobj = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': proxy,
        'ftpProxy': proxy,
        'sslProxy': proxy,
        'noProxy': '',  # set this value as desired
    })
    # Work on a copy: DesiredCapabilities.FIREFOX is a class-level dict, so
    # mutating it in place would leak these settings into every later user.
    capabilities = DesiredCapabilities.FIREFOX.copy()
    capabilities['acceptSslCerts'] = True
    proxyobj.add_to_capabilities(capabilities)
    return capabilities
# Pool of three Firefox drivers shared by the request handlers. Each is
# built from the same profile directory and its own capabilities dict.
# The capabilities come from get_proxy_obj(); the original called an
# undefined get_capabilities(), which raised NameError at import time.
drivers = [
    Firefox(FirefoxProfile('/etc/firefox/u2vgyy61.Proxied_User/'),
            capabilities=get_proxy_obj())
    for _ in range(3)
]
class Routes(WebApp):
    """WebApp exposing ``GET /get_response`` backed by the shared driver pool."""

    url_map = Map([
        Rule('/get_response', endpoint='get_response', methods=['GET']),
    ])

    def endpoint_get_response(self, adapter, request, **values):
        """Load the URL given in ``query_param_here`` and return its page source.

        Args:
            adapter: URL adapter that matched the request (unused here).
            request: Incoming request; the target URL is read from
                ``request.values["query_param_here"]``.
            **values: URL rule values (none are defined for this rule).

        Returns:
            A ``Response`` with the page source on success, status 400 when
            the query parameter is missing or empty, or status 502 when the
            browser fails to load the page.
        """
        url = request.values.get("query_param_here", "")
        if not url:
            return Response("Not", status=400)

        # Wait until a driver is free. Only an empty pool is retried, so
        # genuine programming errors are no longer swallowed by the loop.
        while True:
            try:
                driver = drivers.pop()
                break
            except IndexError:
                sleep(1)

        try:
            # driver.get() returns None; the rendered document is exposed
            # through page_source once the load has completed.
            driver.get(url)
            response_text = driver.page_source
        except Exception:
            # Request boundary: log the failure and report it instead of
            # retrying forever on a URL that cannot be loaded.
            self.log.exception("failed to load %s", url)
            return Response("Failed to load URL", status=502)
        finally:
            # Always hand the driver back, even when the load failed.
            drivers.append(driver)

        return Response(response_text)
例如用法:
curl http://node1/get_response?query_param_here=http://stackoverflow.com
curl http://node2/get_response?query_param_here=http://stackoverflow.com
curl http://node3/get_response?query_param_here=http://stackoverflow.com
curl http://node4/get_response?query_param_here=http://stackoverflow.com
...
and so on
可搭配負載均衡器使用,例如:
搭建一個 Selenium Grid,把 `maxInstances` 設爲每個節點能夠承受的數量,這樣就可以按需添加節點?你需要多少個實例?每分鐘有多少個請求?如果這不可行,或許可以考慮複用 Selenium 會話,在請求到來時輪流使用它們? – jmunsch
@jmunsch 我之前不知道 Selenium Grid。我打算一次並行使用 5 個實例,每個請求會有 2-5 秒的延遲。 – Volatil3
@jmunsch 其次,我需要一個基於服務器的解決方案,而這個 Grid 似乎需要安裝 Java – Volatil3