Skip to content

Commit 91c7f32

Browse files
committed
Refactor http handling in pex
This reworks http handling in pex to be more performant and to allow for alternate implementations and connection disciplines. It also fixes the general flakiness around untranslatable packages. The pex.http submodule is gone and each of its packages is moved into pex directly: pex.crawler pex.link pex.http Crawler is out of the business of caching -- instead this is handed off to the http layer. Link is out of the business of fetching -- it is now only a wrapper around a URL. Web/CachedWeb is killed in favor of a new class pex.http.Context. Subclasses need only implement 'open(link)' and return a file-like object. There are three concrete implementations: - UrllibContext (python standard library http context) - RequestsContext (requests-based http context) - CachingRequestsContext (a requests-based http context with CacheControl if available) The Requests-based contexts also support https cert validation and hash fragment verification (via StreamFilelike) bringing it up to security parity with pip. The rest of the API is modified as minimally as possible to accommodate the above. Users consuming the 'pex' binary and those who just use 'resolve' with default implementations will be unaffected. Changes that will break pants: Obtainer now takes a context instead of a crawler (don't dwell on this too much -- Obtainer will be deleted altogether in the next review.) Translators no longer take conn_timeout since they no longer do any fetching -- this responsibility is delegated to the Context implementations. Increments to 0.8.0-rc0. Testing Done: pex.{crawler,link,http} have improved coverage over their predecessors. The only thing I can think that might be worse is that UrllibContext does nothing to try to recover from errors -- it's mostly assumed that people will use the RequestsContext. Reviewed at https://rbcommons.com/s/twitter/r/778/
1 parent 32160e7 commit 91c7f32

28 files changed

+749
-882
lines changed

.travis.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ python: 2.7
33
env:
44
- TOXENV=py26
55
- TOXENV=py27
6+
- TOXENV=py27-requests
67
- TOXENV=py33
78
- TOXENV=py34
89
- TOXENV=pypy

docs/api/index.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,3 @@ PEX API Reference
66

77
pex
88
pex.bin
9-
pex.http

docs/api/pex.http.rst

Lines changed: 0 additions & 46 deletions
This file was deleted.

docs/api/pex.rst

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
pex package
22
===========
33

4-
Subpackages
5-
-----------
6-
7-
.. toctree::
8-
9-
pex.bin
10-
pex.http
11-
124
Submodules
135
----------
146

7+
pex.archiver module
8+
---------------
9+
10+
.. automodule:: pex.archiver
11+
:members:
12+
:undoc-members:
13+
:show-inheritance:
14+
1515
pex.base module
1616
---------------
1717

@@ -36,6 +36,14 @@ pex.compatibility module
3636
:undoc-members:
3737
:show-inheritance:
3838

39+
pex.crawler module
40+
------------------------
41+
42+
.. automodule:: pex.crawler
43+
:members:
44+
:undoc-members:
45+
:show-inheritance:
46+
3947
pex.environment module
4048
----------------------
4149

@@ -60,6 +68,14 @@ pex.finders module
6068
:undoc-members:
6169
:show-inheritance:
6270

71+
pex.http module
72+
--------------------
73+
74+
.. automodule:: pex.http
75+
:members:
76+
:undoc-members:
77+
:show-inheritance:
78+
6379
pex.installer module
6480
--------------------
6581

@@ -76,6 +92,14 @@ pex.interpreter module
7692
:undoc-members:
7793
:show-inheritance:
7894

95+
pex.link module
96+
----------------------
97+
98+
.. automodule:: pex.link
99+
:members:
100+
:undoc-members:
101+
:show-inheritance:
102+
79103
pex.marshaller module
80104
---------------------
81105

pex/bin/pex.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -245,10 +245,10 @@ def build_obtainer(options):
245245
package_precedence = (EggPackage, SourcePackage)
246246

247247
obtainer = CachingObtainer(
248-
install_cache=options.cache_dir,
249248
fetchers=fetchers,
250249
translators=translator,
251-
precedence=package_precedence)
250+
precedence=package_precedence,
251+
cache=options.cache_dir)
252252

253253
return obtainer
254254

pex/crawler.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
# Copyright 2014 Pants project contributors (see CONTRIBUTORS.md).
2+
# Licensed under the Apache License, Version 2.0 (see LICENSE).
3+
4+
import os
5+
import re
6+
import threading
7+
8+
from .compatibility import PY3
9+
from .link import Link
10+
from .http import Context
11+
from .tracer import TRACER
12+
13+
if PY3:
14+
from queue import Empty, Queue
15+
from urllib.parse import urlparse
16+
else:
17+
from Queue import Empty, Queue
18+
from urlparse import urlparse
19+
20+
21+
class PageParser(object):
  """Scrapes hrefs (and optionally rel= links) out of raw HTML index pages."""

  HREF_RE = re.compile(r"""href=(?:"([^"]*)"|\'([^\']*)\'|([^>\s\n]*))""", re.I | re.S)
  REL_RE = re.compile(r"""<[^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*>""", re.I)
  REL_SKIP_EXTENSIONS = frozenset(['.zip', '.tar', '.tar.gz', '.tar.bz2', '.tgz', '.exe'])
  REL_TYPES = frozenset(['homepage', 'download'])

  @classmethod
  def href_match_to_url(cls, match):
    # The HREF_RE alternation yields three capture groups (double-quoted,
    # single-quoted, unquoted); exactly one participates per match.  Return
    # the first non-empty group, or '' when nothing useful was captured.
    for candidate in match.groups():
      if candidate:
        return candidate
    return ''

  @classmethod
  def rel_links(cls, page):
    """return rel= links that should be scraped, skipping obviously data links."""
    for tag_match in cls.REL_RE.finditer(page):
      tag, rel_type = tag_match.group(0), tag_match.group(1)
      # Only rel types that plausibly point at more crawlable pages.
      if rel_type not in cls.REL_TYPES:
        continue
      href_match = cls.HREF_RE.search(tag)
      if href_match is None:
        continue
      href = cls.href_match_to_url(href_match)
      # Skip hrefs that are clearly package data rather than pages.
      if any(urlparse(href).path.endswith(ext) for ext in cls.REL_SKIP_EXTENSIONS):
        continue
      yield href

  @classmethod
  def links(cls, page):
    """return all links on a page, including potentially rel= links."""
    return (cls.href_match_to_url(m) for m in cls.HREF_RE.finditer(page))
53+
54+
55+
def partition(L, pred):
  """Split iterable ``L`` into ``(non_matching, matching)`` by predicate ``pred``.

  Returns two concrete lists rather than the lazy one-shot ``filter``
  iterators Python 3 would produce: the results can be traversed more than
  once, the input is walked (and ``pred`` evaluated) only once, and behavior
  is identical on Python 2 and Python 3.
  """
  non_matching, matching = [], []
  for item in L:
    (matching if pred(item) else non_matching).append(item)
  return non_matching, matching
57+
58+
59+
class Crawler(object):
  """Crawls local directories and remote index pages, collecting package links."""

  @classmethod
  def crawl_local(cls, link):
    """List the directory at ``link.path``.

    Returns a pair of sets ``(file_links, dir_links)`` of Links.  An
    unreadable path logs and yields two empty sets rather than aborting
    the crawl.
    """
    try:
      dirents = os.listdir(link.path)
    # NOTE(review): deliberately broad best-effort handling; consider
    # narrowing to OSError.
    except Exception as e:
      TRACER.log('Failed to read %s: %s' % (link.path, e), V=1)
      return set(), set()
    files, dirs = partition([os.path.join(link.path, fn) for fn in dirents], os.path.isdir)
    return set(map(Link.from_filename, files)), set(map(Link.from_filename, dirs))

  @classmethod
  def crawl_remote(cls, context, link):
    """Fetch ``link`` through ``context.read`` and scrape its hrefs.

    Returns a pair of sets ``(links, rel_links)``: all hrefs on the page,
    and the rel= hrefs worth following.  Fetch failures log and yield two
    empty sets.
    """
    try:
      content = context.read(link)
    # NOTE(review): deliberately broad best-effort handling; consider
    # catching the context's error type instead.
    except Exception as e:
      TRACER.log('Failed to read %s: %s' % (link.url, e), V=1)
      return set(), set()
    links = set(link.join(href) for href in PageParser.links(content))
    rel_links = set(link.join(href) for href in PageParser.rel_links(content))
    return links, rel_links

  @classmethod
  def crawl_link(cls, context, link):
    """Dispatch ``link`` to the local or remote crawler by scheme.

    Unknown schemes log and return two empty sets.
    """
    if link.local:
      return cls.crawl_local(link)
    elif link.remote:
      return cls.crawl_remote(context, link)
    else:
      TRACER.log('Failed to crawl %s: unknown scheme %s' % (link.url, link.scheme))
      return set(), set()

  def __init__(self, context=None, threads=1):
    """
    :param context: http Context used for remote fetches; defaults to
      ``Context.get()``.
    :param threads: number of worker threads used by :meth:`crawl`.
    """
    self._threads = threads
    self.context = context or Context.get()

  def crawl(self, link_or_links, follow_links=False):
    """Crawl one or more links, returning the set of all links discovered.

    :param link_or_links: a single Link/url or an iterable of them.
    :param follow_links: if True, also enqueue and crawl rel= links found
      on fetched pages.
    """
    links, seen = set(), set()
    queue = Queue()
    converged = threading.Event()

    def execute():
      while not converged.is_set():
        try:
          link = queue.get(timeout=0.1)
        except Empty:
          continue
        try:
          if link not in seen:
            # NOTE(review): check-then-add is not atomic across workers, so a
            # link may rarely be crawled twice; harmless since results are sets.
            seen.add(link)
            try:
              roots, rels = self.crawl_link(self.context, link)
            except Exception as e:
              TRACER.log('Unknown exception encountered: %s' % e)
              continue  # the finally below still marks the task done
            links.update(roots)
            if follow_links:
              for rel in rels:
                if rel not in seen:
                  queue.put(rel)
        finally:
          # BUG FIX: task_done() must run even when crawling raises.
          # Previously the exception path `continue`d past task_done(), so
          # queue.join() below never returned and crawl() hung forever.
          queue.task_done()

    for link in Link.wrap_iterable(link_or_links):
      queue.put(link)

    workers = []
    for _ in range(self._threads):
      worker = threading.Thread(target=execute)
      workers.append(worker)
      worker.daemon = True
      worker.start()

    queue.join()
    converged.set()

    for worker in workers:
      worker.join()

    return links

0 commit comments

Comments
 (0)