
Connection pooling does not work when using a proxy

See original GitHub issue

Scrapy creates a new TCP4ClientEndpoint for each request when using a proxy in ScrapyAgent, while Twisted's ProxyAgent uses key = ("http-proxy", self._proxyEndpoint) as the connection pool key. Since every request carries a different endpoint object, the key never matches a cached entry, so a new connection is opened for each request. Once all local ports are exhausted (sockets stuck in TIME_WAIT), requests fail with errno 99: cannot assign requested address.

scrapy/core/downloader/handlers/http11.py

class ScrapyAgent(object):
    def _get_agent(self, request, timeout):
        bindaddress = request.meta.get('bindaddress') or self._bindAddress
        proxy = request.meta.get('proxy')
        if proxy:
            _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
            scheme = _parse(request.url)[0]
            proxyHost = to_unicode(proxyHost)
            omitConnectTunnel = b'noconnect' in proxyParams
            if scheme == b'https' and not omitConnectTunnel:
                proxyConf = (proxyHost, proxyPort,
                             request.headers.get(b'Proxy-Authorization', None))
                return self._TunnelingAgent(reactor, proxyConf,
                    contextFactory=self._contextFactory, connectTimeout=timeout,
                    bindAddress=bindaddress, pool=self._pool)
            else:
                endpoint = TCP4ClientEndpoint(reactor, proxyHost, proxyPort,
                    timeout=timeout, bindAddress=bindaddress)
                return self._ProxyAgent(endpoint)

        return self._Agent(reactor, contextFactory=self._contextFactory,
            connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)

twisted/web/client.py

@implementer(IAgent)
class ProxyAgent(_AgentBase):
    """
    An HTTP agent able to cross HTTP proxies.

    @ivar _proxyEndpoint: The endpoint used to connect to the proxy.

    @since: 11.1
    """

    def __init__(self, endpoint, reactor=None, pool=None):
        if reactor is None:
            from twisted.internet import reactor
        _AgentBase.__init__(self, reactor, pool)
        self._proxyEndpoint = endpoint


    def request(self, method, uri, headers=None, bodyProducer=None):
        """
        Issue a new request via the configured proxy.
        """
        # Cache *all* connections under the same key, since we are only
        # connecting to a single destination, the proxy:
        key = ("http-proxy", self._proxyEndpoint)

        # To support proxying HTTPS via CONNECT, we will use key
        # ("http-proxy-CONNECT", scheme, host, port), and an endpoint that
        # wraps _proxyEndpoint with an additional callback to do the CONNECT.
        return self._requestWithEndpoint(key, self._proxyEndpoint, method,
                                         URI.fromBytes(uri), headers,
                                         bodyProducer, uri)
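The effect of the mismatched pool key can be sketched without Twisted. The Endpoint class below is a hypothetical stand-in for TCP4ClientEndpoint, which defines neither __eq__ nor __hash__, so two endpoints pointing at the same proxy never compare equal and the pool key differs on every request:

```python
class Endpoint:
    """Stand-in for TCP4ClientEndpoint: it defines no __eq__/__hash__,
    so comparison and hashing fall back to object identity."""
    def __init__(self, host, port):
        self.host, self.port = host, port

pool = {}  # simulates HTTPConnectionPool's per-key connection cache

for _ in range(3):
    endpoint = Endpoint("proxy.example.com", 8080)  # new object per request
    key = ("http-proxy", endpoint)                  # ProxyAgent's pool key
    pool.setdefault(key, "connection")

# Three identical-looking requests yield three distinct pool entries,
# so no pooled connection is ever reused:
print(len(pool))  # 3
```

The host and port here are placeholders; the point is only that identity-based keys defeat the cache.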

Issue Analytics

  • State: closed
  • Created: 6 years ago
  • Comments: 13 (6 by maintainers)

Top GitHub Comments

1 reaction
jdxin0 commented, Jun 7, 2017

I used a monkey patch to fix the proxy connection pooling problem, but it caused a lot of timeout (30s) errors even though I am sure the requests did not take 30s. When I removed the monkey patch, behavior returned to normal.

Here is the patch code.

from twisted.web.client import URI
from scrapy.core.downloader.handlers import http11
from scrapy.core.downloader.handlers.http11 import ProxyAgent, _parse, \
    to_unicode, reactor, TCP4ClientEndpoint, ScrapyAgent as _ScrapyAgent


class ScrapyProxyAgent(ProxyAgent):
    def request(self, method, uri, headers=None, bodyProducer=None):
        """
        Issue a new request via the configured proxy.
        """
        # Key the pool by the proxy's host and port (plain hashable
        # values) instead of the endpoint object, so all requests
        # through the same proxy share one cache entry:
        key = ("http-proxy", self._proxyEndpoint._host,
               self._proxyEndpoint._port)
        return self._requestWithEndpoint(key, self._proxyEndpoint, method,
                                         URI.fromBytes(uri), headers,
                                         bodyProducer, uri)


class ScrapyAgent(_ScrapyAgent):
    _ProxyAgent = ScrapyProxyAgent
    
    def _get_agent(self, request, timeout):
        bindaddress = request.meta.get('bindaddress') or self._bindAddress
        proxy = request.meta.get('proxy')
        if proxy:
            _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
            scheme = _parse(request.url)[0]
            proxyHost = to_unicode(proxyHost)
            omitConnectTunnel = b'noconnect' in proxyParams
            if scheme == b'https' and not omitConnectTunnel:
                proxyConf = (proxyHost, proxyPort,
                             request.headers.get(b'Proxy-Authorization', None))
                return self._TunnelingAgent(reactor, proxyConf,
                                            contextFactory=self._contextFactory,
                                            connectTimeout=timeout,
                                            bindAddress=bindaddress,
                                            pool=self._pool)
            else:
                endpoint = TCP4ClientEndpoint(reactor, proxyHost, proxyPort,
                                              timeout=timeout,
                                              bindAddress=bindaddress)
                return self._ProxyAgent(endpoint, pool=self._pool)
        
        return self._Agent(reactor, contextFactory=self._contextFactory,
                           connectTimeout=timeout, bindAddress=bindaddress,
                           pool=self._pool)
    
    
def patch_proxy():
    http11.ScrapyAgent = ScrapyAgent
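The patch above keys the pool by the proxy's host and port, which are plain hashable values, so every request through the same proxy maps to the same cache entry. A minimal sketch of the difference, using the same kind of hypothetical stand-in pool (host and port are placeholders):

```python
pool = {}  # simulates HTTPConnectionPool's per-key connection cache

for _ in range(3):
    # The patched key uses value types (str, int) rather than the
    # endpoint object, so it is identical across requests:
    key = ("http-proxy", "proxy.example.com", 8080)
    pool.setdefault(key, "connection")

print(len(pool))  # 1 -> the pooled connection can be reused
```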

0 reactions
jdxin0 commented, Sep 4, 2017

@redapple My problem is solved with your patch, after fixing an issue in my self-implemented proxy server.
