# Source code for nbodykit.tutorials.wget
# Found from https://gist.github.com/remram44/6540454
from six.moves.html_parser import HTMLParser
from six.moves.urllib.error import HTTPError
from six.moves.urllib.request import urlopen
from six import string_types, PY2
import os
import re
# where the main nbodykit data examples live
data_url = "http://portal.nersc.gov/project/m3035/nbodykit/example-data"

# matches an absolute URL: group(1) = scheme://host, group(4) = path (if any)
re_url = re.compile(r'^(([a-zA-Z_-]+)://([^/]+))(/.*)?$')


def resolve_link(link, url):
    """
    Resolve ``link`` into an absolute URL, using ``url`` as the base.

    Three cases are handled: ``link`` is already absolute (a bare
    ``scheme://domain`` gains a trailing slash), ``link`` is an absolute
    path on the same host, or ``link`` is a path relative to ``url``.
    """
    absolute = re_url.match(link)
    if absolute is not None:
        # already a full URL; normalize http://domain -> http://domain/
        return link + '/' if not absolute.group(4) else link

    if link[0] == '/':
        # absolute path: prepend scheme://host taken from the base URL
        base = re_url.match(url)
        return base.group(1) + link

    # relative path: join onto the base, inserting '/' only when needed
    separator = '' if url[-1] == '/' else '/'
    return url + separator + link
class ListingParser(HTMLParser):
    """
    Parses an HTML file and builds a list of links.

    Links are stored into the 'links' set. They are resolved into
    absolute links.
    """
    def __init__(self, url):
        """
        Parameters
        ----------
        url : str
            the base URL against which relative hrefs are resolved
        """
        HTMLParser.__init__(self)

        # ensure the base URL ends in '/' so relative links resolve correctly
        if url[-1] != '/':
            url += '/'
        self.__url = url

        # the set of absolute links found while parsing
        self.links = set()

    def handle_starttag(self, tag, attrs):
        """Collect the href of every anchor tag, resolved to absolute form."""
        if tag == 'a':
            for key, value in attrs:
                if key == 'href':
                    # skip empty hrefs
                    if not value:
                        continue
                    value = resolve_link(value, self.__url)
                    self.links.add(value)
                    break
def mirror(url, target=None):
    """
    Mirror a URL recursively to a local target.

    If ``target`` is not supplied, the last part of the url is used as
    the target.

    Parameters
    ----------
    url : str
        the URL to download
    target : str, optional
        the local file target to save the url to; if not provided, the
        last part of the url is used.
    """
    if target is None:
        target = os.path.normpath(url).split(os.path.sep)[-1]

    # lazily create the target directory the first time something
    # actually needs to be written into it
    def mkdir():
        if not mkdir.done:
            try:
                os.mkdir(target)
            except OSError:
                pass  # best-effort: the directory may already exist
            mkdir.done = True
    mkdir.done = False

    # open the URL so we can parse it
    response = urlopen(url)
    try:
        info = response.info()
        content_type = info.type if PY2 else info.get_content_type()

        # HTML file --> keep parsing
        if content_type == 'text/html':
            contents = response.read().decode()

            parser = ListingParser(url)
            parser.feed(contents)

            for link in parser.links:
                link = resolve_link(link, url)
                if link[-1] == '/':
                    link = link[:-1]
                # only descend into links below the current URL
                if not link.startswith(url):
                    continue
                name = link.rsplit('/', 1)[1]
                # skip query-string links (e.g. sort links in directory listings)
                if '?' in name:
                    continue
                mkdir()
                mirror(link, os.path.join(target, name))

            if not mkdir.done:
                # We didn't find anything to write inside this directory
                # Maybe it's a HTML file?
                if url[-1] != '/':
                    end = target[-5:].lower()
                    if not (end.endswith('.htm') or end.endswith('.html')):
                        target = target + '.html'
                    with open(target, 'wb') as fp:
                        fp.write(contents.encode())

        # just download the file
        else:
            buffer_size = 4096 * 32
            with open(target, 'wb') as fp:
                chunk = response.read(buffer_size)
                while chunk:
                    fp.write(chunk)
                    chunk = response.read(buffer_size)
    finally:
        # FIX: the original never closed the response, leaking a
        # connection per (recursive) call
        response.close()
def available_examples():
    """
    Return a list of available example data files from the nbodykit
    data repository on NERSC.

    Returns
    -------
    examples : list
        list of the available file names for download
    """
    # read the contents of the main data URL
    response = urlopen(data_url)
    try:
        contents = response.read().decode()
    finally:
        # FIX: release the connection (original left it open)
        response.close()

    # parse the available files
    parser = ListingParser(data_url)
    parser.feed(contents)

    # get relative paths and remove bad links (hidden files, sort links)
    available = [os.path.relpath(link, data_url) for link in parser.links]
    available = [link for link in available if not any(link.startswith(bad) for bad in ['.', '?'])]
    return sorted(available)
def download_example_data(filenames, download_dirname=None):
    """
    Download a data file from the nbodykit repository of example data.

    For a list of valid file names, see :func:`available_examples`.

    Parameters
    ----------
    filenames : str, list of str
        the name(s) of the example file to download (relative to the path of the
        nbodykit repository); see :func:`available_examples` for the example
        file names
    download_dirname : str, optional
        a local directory to download the file to; if not specified, the
        file will be downloaded to the current working directory

    Raises
    ------
    ValueError
        if ``download_dirname`` is not an existing directory, or if a
        requested file does not exist in the remote repository
    """
    if isinstance(filenames, string_types):
        filenames = [filenames]

    # make sure the download directory exists
    if download_dirname is not None:
        if not os.path.isdir(download_dirname):
            raise ValueError("specified download directory is not valid")

    # download all requested filenames
    for filename in filenames:

        # where we are saving locally
        if download_dirname is not None:
            target = os.path.join(download_dirname, filename)
        else:
            target = None

        # the full url to the data we want
        # FIX: URLs always use '/'; os.path.join would insert '\\' on Windows
        url = data_url + '/' + filename

        # try to mirror locally
        try:
            mirror(url, target=target)
        except HTTPError as err:
            # if not found, print available file names, else just raise
            if err.code == 404:
                args = (filename, str(available_examples()))
                raise ValueError("no such example file '%s'\n\navailable examples are: %s" % args)
            else:
                raise