# Source code for nbodykit.tutorials.wget

# Adapted from https://gist.github.com/remram44/6540454

from six.moves.html_parser import HTMLParser
from six.moves.urllib.error import HTTPError
from six.moves.urllib.request import urlopen
from six import string_types, PY2
import os
import re


# where the main nbodykit data examples live
data_url = "http://portal.nersc.gov/project/m3035/nbodykit/example-data"

re_url = re.compile(r'^(([a-zA-Z_-]+)://([^/]+))(/.*)?$')
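
# As an illustration, for a hypothetical URL such as
# 'http://example.com/some/path', the groups captured by re_url are:
#   group(1) -> 'http://example.com'   (scheme + host)
#   group(2) -> 'http'                 (scheme)
#   group(3) -> 'example.com'          (host)
#   group(4) -> '/some/path'           (path; may be absent)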

def resolve_link(link, url):
    m = re_url.match(link)
    if m is not None:
        if not m.group(4):
            # http://domain -> http://domain/
            return link + '/'
        else:
            return link
    elif link[0] == '/':
        # /some/path
        murl = re_url.match(url)
        return murl.group(1) + link
    else:
        # relative/path
        if url[-1] == '/':
            return url + link
        else:
            return url + '/' + link
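
# A quick sketch of the three branches above, using hypothetical URLs:
#
#   resolve_link('http://other.com', 'http://example.com/dir')
#       -> 'http://other.com/'                 (absolute link, '/' appended)
#   resolve_link('/top/file.txt', 'http://example.com/dir')
#       -> 'http://example.com/top/file.txt'   (host-relative link)
#   resolve_link('file.txt', 'http://example.com/dir')
#       -> 'http://example.com/dir/file.txt'   (relative link)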


class ListingParser(HTMLParser):
    """Parses an HTML file and builds a list of links.

    Links are stored into the 'links' set. They are resolved into
    absolute links.
    """
    def __init__(self, url):
        HTMLParser.__init__(self)

        if url[-1] != '/':
            url += '/'
        self.__url = url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for key, value in attrs:
                if key == 'href':
                    if not value:
                        continue
                    value = resolve_link(value, self.__url)
                    self.links.add(value)
                    break
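
# A minimal usage sketch (the URL and HTML snippet below are hypothetical):
#
#   parser = ListingParser('http://example.com/data')
#   parser.feed('<a href="file.txt">file.txt</a>')
#   parser.links  # -> {'http://example.com/data/file.txt'}
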
def mirror(url, target=None):
    """
    Mirror a URL recursively to a local target.

    If ``target`` is not supplied, the last part of the url is used
    as the target.

    Parameters
    ----------
    url : str
        the URL to download
    target : str, optional
        the local file target to save the url to; if not provided,
        the last part of the url is used.
    """
    if target is None:
        target = os.path.normpath(url).split(os.path.sep)[-1]

    # lazily create the target directory, only once, the first time
    # a link below it needs to be mirrored
    def mkdir():
        if not mkdir.done:
            try:
                os.mkdir(target)
            except OSError:
                pass
            mkdir.done = True
    mkdir.done = False

    # open the URL so we can parse it
    response = urlopen(url)

    # HTML file --> keep parsing
    info = response.info()
    content_type = info.type if PY2 else info.get_content_type()
    if content_type == 'text/html':
        contents = response.read().decode()

        parser = ListingParser(url)
        parser.feed(contents)

        for link in parser.links:
            link = resolve_link(link, url)
            if link[-1] == '/':
                link = link[:-1]
            if not link.startswith(url):
                continue
            name = link.rsplit('/', 1)[1]
            if '?' in name:
                continue
            mkdir()
            mirror(link, os.path.join(target, name))

        if not mkdir.done:
            # We didn't find anything to write inside this directory.
            # Maybe it's an HTML file?
            if url[-1] != '/':
                end = target[-5:].lower()
                if not (end.endswith('.htm') or end.endswith('.html')):
                    target = target + '.html'
                with open(target, 'wb') as fp:
                    fp.write(contents.encode())

    # just download the file
    else:
        buffer_size = 4096 * 32
        with open(target, 'wb') as fp:
            chunk = response.read(buffer_size)
            while chunk:
                fp.write(chunk)
                chunk = response.read(buffer_size)
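
# A hedged usage sketch: mirroring the whole example-data listing into a
# local directory named after the last URL component ('example-data'):
#
#   mirror(data_url)
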
def available_examples():
    """
    Return a list of available example data files from the nbodykit
    data repository on NERSC.

    Returns
    -------
    examples : list
        list of the available file names for download
    """
    # read the contents of the main data URL
    response = urlopen(data_url)
    contents = response.read().decode()

    # parse the available files
    parser = ListingParser(data_url)
    parser.feed(contents)

    # get relative paths and remove bad links
    available = [os.path.relpath(link, data_url) for link in parser.links]
    available = [link for link in available
                 if not any(link.startswith(bad) for bad in ['.', '?'])]

    return sorted(available)
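
# Typical usage (the returned names depend on what is currently hosted
# on NERSC, so no output is shown here):
#
#   examples = available_examples()
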
def download_example_data(filenames, download_dirname=None):
    """
    Download a data file from the nbodykit repository of example data.

    For a list of valid file names, see :func:`available_examples`.

    Parameters
    ----------
    filenames : str, list of str
        the name(s) of the example file to download (relative to the
        path of the nbodykit repository); see :func:`available_examples`
        for the example file names
    download_dirname : str, optional
        a local directory to download the file to; if not specified, the
        file will be downloaded to the current working directory
    """
    if isinstance(filenames, string_types):
        filenames = [filenames]

    # make sure the download directory exists
    if download_dirname is not None:
        if not os.path.isdir(download_dirname):
            raise ValueError("specified download directory is not valid")

    # download all requested filenames
    for filename in filenames:

        # where we are saving locally
        if download_dirname is not None:
            target = os.path.join(download_dirname, filename)
        else:
            target = None

        # the full url to the data we want
        url = os.path.join(data_url, filename)

        # try to mirror locally
        try:
            mirror(url, target=target)
        except HTTPError as err:
            # if not found, print available file names, else just raise
            if err.code == 404:
                args = (filename, str(available_examples()))
                raise ValueError("no such example file '%s'\n\navailable examples are: %s" % args)
            else:
                raise
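
# A minimal usage sketch (the file name below is hypothetical; call
# available_examples() for the real listing):
#
#   download_example_data('some_example.dat', download_dirname='data')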