Skip to content

Commit 91c7f32

Browse files
committed
Refactor http handling in pex
This reworks http handling in pex to be more performant and to allow for alternate implementations and connection disciplines. It also fixes the general flakiness around untranslatable packages. The pex.http submodule is gone and each of its packages is moved into pex directly: pex.crawler pex.link pex.http Crawler is out of the business of caching -- instead this is handed off to the http layer. Link is out of the business of fetching -- it is now only a wrapper around a URL. Web/CachedWeb is killed in favor of a new class pex.http.Context. Subclasses need only implement 'open(link)' and return a file-like object. There are three concrete implementations: - UrllibContext (python standard library http context) - RequestsContext (requests-based http context) - CachingRequestsContext (a requests-based http context with CacheControl if available) The Requests-based contexts also support https cert validation and hash fragment verification (via StreamFilelike) bringing it up to security parity with pip. The rest of the API is modified as minimally as possible to accommodate the above. Users consuming the 'pex' binary and those who just use 'resolve' with default implementations will be unaffected. Changes that will break pants: Obtainer now takes a context instead of a crawler (don't dwell on this too much -- Obtainer will be deleted altogether in the next review.) Translators no longer take conn_timeout since they no longer do any fetching -- this responsibility is delegated to the Context implementations. Increments to 0.8.0-rc0. Testing Done: pex.{crawler,link,http} have improved coverage over their predecessors. The only thing I can think that might be worse is that UrllibContext does nothing to try to recover from errors -- it's mostly assumed that people will use the RequestsContext. Reviewed at https://rbcommons.com/s/twitter/r/778/
1 parent 32160e7 commit 91c7f32

28 files changed

+749
-882
lines changed

.travis.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ python: 2.7
33
env:
44
- TOXENV=py26
55
- TOXENV=py27
6+
- TOXENV=py27-requests
67
- TOXENV=py33
78
- TOXENV=py34
89
- TOXENV=pypy

docs/api/index.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,3 @@ PEX API Reference
66

77
pex
88
pex.bin
9-
pex.http

docs/api/pex.http.rst

Lines changed: 0 additions & 46 deletions
This file was deleted.

docs/api/pex.rst

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
pex package
22
===========
33

4-
Subpackages
5-
-----------
6-
7-
.. toctree::
8-
9-
pex.bin
10-
pex.http
11-
124
Submodules
135
----------
146

7+
pex.archiver module
8+
---------------
9+
10+
.. automodule:: pex.archiver
11+
:members:
12+
:undoc-members:
13+
:show-inheritance:
14+
1515
pex.base module
1616
---------------
1717

@@ -36,6 +36,14 @@ pex.compatibility module
3636
:undoc-members:
3737
:show-inheritance:
3838

39+
pex.crawler module
40+
------------------------
41+
42+
.. automodule:: pex.crawler
43+
:members:
44+
:undoc-members:
45+
:show-inheritance:
46+
3947
pex.environment module
4048
----------------------
4149

@@ -60,6 +68,14 @@ pex.finders module
6068
:undoc-members:
6169
:show-inheritance:
6270

71+
pex.http module
72+
--------------------
73+
74+
.. automodule:: pex.http
75+
:members:
76+
:undoc-members:
77+
:show-inheritance:
78+
6379
pex.installer module
6480
--------------------
6581

@@ -76,6 +92,14 @@ pex.interpreter module
7692
:undoc-members:
7793
:show-inheritance:
7894

95+
pex.link module
96+
----------------------
97+
98+
.. automodule:: pex.link
99+
:members:
100+
:undoc-members:
101+
:show-inheritance:
102+
79103
pex.marshaller module
80104
---------------------
81105

pex/bin/pex.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -245,10 +245,10 @@ def build_obtainer(options):
245245
package_precedence = (EggPackage, SourcePackage)
246246

247247
obtainer = CachingObtainer(
248-
install_cache=options.cache_dir,
249248
fetchers=fetchers,
250249
translators=translator,
251-
precedence=package_precedence)
250+
precedence=package_precedence,
251+
cache=options.cache_dir)
252252

253253
return obtainer
254254

pex/crawler.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
# Copyright 2014 Pants project contributors (see CONTRIBUTORS.md).
2+
# Licensed under the Apache License, Version 2.0 (see LICENSE).
3+
4+
import os
5+
import re
6+
import threading
7+
8+
from .compatibility import PY3
9+
from .link import Link
10+
from .http import Context
11+
from .tracer import TRACER
12+
13+
if PY3:
14+
from queue import Empty, Queue
15+
from urllib.parse import urlparse
16+
else:
17+
from Queue import Empty, Queue
18+
from urlparse import urlparse
19+
20+
21+
class PageParser(object):
  """Scrapes hrefs (and optionally rel= links) out of raw HTML index pages."""

  HREF_RE = re.compile(r"""href=(?:"([^"]*)"|\'([^\']*)\'|([^>\s\n]*))""", re.I | re.S)
  REL_RE = re.compile(r"""<[^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*>""", re.I)
  REL_SKIP_EXTENSIONS = frozenset(['.zip', '.tar', '.tar.gz', '.tar.bz2', '.tgz', '.exe'])
  REL_TYPES = frozenset(['homepage', 'download'])

  @classmethod
  def href_match_to_url(cls, match):
    # The HREF_RE alternation yields three capture groups (double-quoted,
    # single-quoted, unquoted); exactly one participates per match.  Return
    # the first non-empty group, or '' when nothing useful was captured.
    for candidate in match.groups():
      if candidate:
        return candidate
    return ''

  @classmethod
  def rel_links(cls, page):
    """return rel= links that should be scraped, skipping obviously data links."""
    for tag_match in cls.REL_RE.finditer(page):
      tag, rel_type = tag_match.group(0), tag_match.group(1)
      # Only rel types that plausibly point at more crawlable pages.
      if rel_type not in cls.REL_TYPES:
        continue
      href_match = cls.HREF_RE.search(tag)
      if href_match is None:
        continue
      href = cls.href_match_to_url(href_match)
      # Skip hrefs that are clearly package data rather than pages.
      if any(urlparse(href).path.endswith(ext) for ext in cls.REL_SKIP_EXTENSIONS):
        continue
      yield href

  @classmethod
  def links(cls, page):
    """return all links on a page, including potentially rel= links."""
    return (cls.href_match_to_url(m) for m in cls.HREF_RE.finditer(page))
53+
54+
55+
def partition(L, pred):
  """Split iterable ``L`` into ``(non_matching, matching)`` by predicate ``pred``.

  Returns two concrete lists rather than the lazy one-shot ``filter``
  iterators Python 3 would produce: the results can be traversed more than
  once, the input is walked (and ``pred`` evaluated) only once, and behavior
  is identical on Python 2 and Python 3.
  """
  non_matching, matching = [], []
  for item in L:
    (matching if pred(item) else non_matching).append(item)
  return non_matching, matching
57+
58+
59+
class Crawler(object):
  """Crawls local directories and remote index pages, collecting package links."""

  @classmethod
  def crawl_local(cls, link):
    """List the directory at ``link.path``.

    Returns a pair of sets ``(file_links, dir_links)`` of Links.  An
    unreadable path logs and yields two empty sets rather than aborting
    the crawl.
    """
    try:
      dirents = os.listdir(link.path)
    # NOTE(review): deliberately broad best-effort handling; consider
    # narrowing to OSError.
    except Exception as e:
      TRACER.log('Failed to read %s: %s' % (link.path, e), V=1)
      return set(), set()
    files, dirs = partition([os.path.join(link.path, fn) for fn in dirents], os.path.isdir)
    return set(map(Link.from_filename, files)), set(map(Link.from_filename, dirs))

  @classmethod
  def crawl_remote(cls, context, link):
    """Fetch ``link`` through ``context.read`` and scrape its hrefs.

    Returns a pair of sets ``(links, rel_links)``: all hrefs on the page,
    and the rel= hrefs worth following.  Fetch failures log and yield two
    empty sets.
    """
    try:
      content = context.read(link)
    # NOTE(review): deliberately broad best-effort handling; consider
    # catching the context's error type instead.
    except Exception as e:
      TRACER.log('Failed to read %s: %s' % (link.url, e), V=1)
      return set(), set()
    links = set(link.join(href) for href in PageParser.links(content))
    rel_links = set(link.join(href) for href in PageParser.rel_links(content))
    return links, rel_links

  @classmethod
  def crawl_link(cls, context, link):
    """Dispatch ``link`` to the local or remote crawler by scheme.

    Unknown schemes log and return two empty sets.
    """
    if link.local:
      return cls.crawl_local(link)
    elif link.remote:
      return cls.crawl_remote(context, link)
    else:
      TRACER.log('Failed to crawl %s: unknown scheme %s' % (link.url, link.scheme))
      return set(), set()

  def __init__(self, context=None, threads=1):
    """
    :param context: http Context used for remote fetches; defaults to
      ``Context.get()``.
    :param threads: number of worker threads used by :meth:`crawl`.
    """
    self._threads = threads
    self.context = context or Context.get()

  def crawl(self, link_or_links, follow_links=False):
    """Crawl one or more links, returning the set of all links discovered.

    :param link_or_links: a single Link/url or an iterable of them.
    :param follow_links: if True, also enqueue and crawl rel= links found
      on fetched pages.
    """
    links, seen = set(), set()
    queue = Queue()
    converged = threading.Event()

    def execute():
      while not converged.is_set():
        try:
          link = queue.get(timeout=0.1)
        except Empty:
          continue
        try:
          if link not in seen:
            # NOTE(review): check-then-add is not atomic across workers, so a
            # link may rarely be crawled twice; harmless since results are sets.
            seen.add(link)
            try:
              roots, rels = self.crawl_link(self.context, link)
            except Exception as e:
              TRACER.log('Unknown exception encountered: %s' % e)
              continue  # the finally below still marks the task done
            links.update(roots)
            if follow_links:
              for rel in rels:
                if rel not in seen:
                  queue.put(rel)
        finally:
          # BUG FIX: task_done() must run even when crawling raises.
          # Previously the exception path `continue`d past task_done(), so
          # queue.join() below never returned and crawl() hung forever.
          queue.task_done()

    for link in Link.wrap_iterable(link_or_links):
      queue.put(link)

    workers = []
    for _ in range(self._threads):
      worker = threading.Thread(target=execute)
      workers.append(worker)
      worker.daemon = True
      worker.start()

    queue.join()
    converged.set()

    for worker in workers:
      worker.join()

    return links

0 commit comments

Comments
 (0)