# SPDX-FileCopyrightText: Copyright © 2022 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: BSD-3-Clause
"""This module contains instructions for documentation lookup."""
from __future__ import annotations # not required for Python >= 3.10
import collections.abc
import importlib.metadata
import importlib.resources
import json
import pathlib
import re
import shutil
import typing
import lxml.html
import packaging.version
import requests
from sphinx.util import logging
logger = logging.getLogger(__name__)
PackageDictionaryType = dict[str, dict[str, str]]
"""Type for the internal values of :py:class:`Catalog`"""
BUILTIN_CATALOG = importlib.resources.files(__package__).joinpath("catalog.json")
"""Base name for the catalog file distributed with this package."""
PEP440_RE = re.compile(
r"^\s*" + packaging.version.VERSION_PATTERN + r"\s*$",
re.VERBOSE | re.IGNORECASE,
)
"""Regular expression for matching PEP-440 version numbers."""
def _ensure_webdir(addr: str) -> str:
"""Ensures the web-address ends in a /, and contains ``objects.inv``"""
if addr.endswith(".html"):
addr = addr[: addr.rfind("/")]
if not addr.endswith("/"):
addr += "/"
# objects = addr + "/" + "objects.inv"
# if requests.head(objects).ok:
# logger.error("Cannot find {objects}...")
# return None
return addr
def _reorder_versions(vdict: dict[str, str]) -> dict[str, str]:
"""Re-orders version dictionary by decreasing version."""
# nota bene: new dicts preserve insertion order
retval: dict[str, str] = {}
# these keys come always first, if available
protected = ("latest", "main", "master", "stable")
for key in protected:
if key in vdict:
retval[key] = vdict[key]
# next, are releases in reverse order
version_map = {
packaging.version.Version(k): k
for k in vdict.keys()
if (k not in protected) and PEP440_RE.match(k)
}
for version in sorted(version_map.keys(), reverse=True):
retval[version_map[version]] = vdict[version_map[version]]
# now, everything else
retval.update({k: v for k, v in vdict.items() if k not in retval})
return retval
[docs]
def docurls_from_environment(package: str) -> dict[str, str]:
"""Checks installed package metadata for documentation URLs.
Arguments:
package: Name of the package you want to check
version: A version such as "stable", "latest" or a formal version
number parsed by :py:class:`packaging.version.Version`.
Returns:
A dictionary, that maps the version of the documentation found on PyPI
to the URL.
"""
try:
md = importlib.metadata.metadata(package)
if md.get_all("Project-URL") is None:
return {}
for k in md.get_all("Project-URL"):
if k.startswith(("documentation, ", "Documentation, ")):
addr = _ensure_webdir(k.split(",", 1)[1].strip())
if requests.head(addr + "/objects.inv").ok:
try:
return {md["version"]: addr}
except KeyError:
return {"latest": addr}
except importlib.metadata.PackageNotFoundError:
pass
return {}
[docs]
def docurls_from_rtd(package: str) -> dict[str, str]:
"""Checks readthedocs.org for documentation pointers for the package.
Arguments:
package: Name of the package to check on rtd.org - this must be the
name it is know at rtd.org and not necessarily the package name.
Some packages do have different names on rtd.org.
Returns:
A dictionary, which contains all versions of documentation available
for the given package on RTD. If the package's documentation is not
available on RTD, returns an empty dictionary.
"""
try:
url = f"https://readthedocs.org/projects/{package}/versions/"
logger.debug(f"Reaching for `{url}'...")
r = requests.get(f"https://readthedocs.org/projects/{package}/versions/")
if r.ok:
tree = lxml.html.fromstring(r.text)
return {
k.text: _ensure_webdir(k.attrib["href"])
for k in tree.xpath("//a[contains(@class, 'module-item-title')]")
if k.attrib["href"].startswith("http")
}
except requests.exceptions.RequestException:
pass
return {}
def _get_json(url: str) -> dict | None:
try:
logger.debug(f"Reaching for `{url}'...")
r = requests.get(url)
if r.ok:
return r.json()
except requests.exceptions.RequestException:
pass
return None
[docs]
def docurls_from_pypi(package: str, max_entries: int) -> dict[str, str]:
"""Checks PyPI for documentation pointers for a given package.
This procedure first looks up the main repo JSON entry, and then figures
out all available versions of the package. In a second step, and depending
on the value of ``max_entries``, this function will retrieve the latest
``max_entries`` available on that particular package.
Arguments:
package: Name of the PyPI package you want to check
max_entries: The maximum number of entries to lookup in PyPI. A value
of zero will download only the main package information and will
hit PyPI only once. A value bigger than zero will download at most
the information from the last ``max_entries`` releases. Finally, a
negative value will imply the download of all available releases.
Returns:
A dictionary, that maps the version of the documentation found on PyPI
to the URL.
"""
versions: dict[str, str] = {}
data = _get_json(f"https://pypi.org/pypi/{package}/json")
if data is None:
return versions
urls = data["info"]["project_urls"]
addr = urls.get("Documentation") or urls.get("documentation")
if addr is not None:
addr = _ensure_webdir(addr)
if requests.head(addr + "/objects.inv").ok:
versions[data["info"]["version"]] = addr
# download further versions, if requested by user
version_map = {
packaging.version.Version(k): k
for k in data["releases"].keys()
if PEP440_RE.match(k)
}
versions_to_probe = sorted(list(version_map.keys()), reverse=True)
if max_entries >= 0:
versions_to_probe = versions_to_probe[:max_entries]
for k in versions_to_probe:
data = _get_json(f"https://pypi.org/pypi/{package}/{version_map[k]}/json")
if data is None:
continue
urls = data["info"]["project_urls"]
addr = urls.get("Documentation") or urls.get("documentation")
if addr is not None:
addr = _ensure_webdir(addr)
if requests.head(addr + "/objects.inv").ok:
versions[data["info"]["version"]] = addr
return versions
[docs]
class Catalog(collections.abc.MutableMapping):
"""A type that can lookup and store information about Sphinx documents.
The object is organised as a dictionary (mutable mapping type) with extra
methods to handle information update from various sources. Information is
organised as dictionary mapping Python package names to another dictionary
containing the following entries:
* ``versions``: A dictionary mapping version numbers to URLs. The keys
have free form, albeit are mostly PEP440 version numbers. Keywords such
as ``stable``, ``latest``, ``master``, or ``main`` are typically found as
well.
* ``sources``: A dictionary mapping information sources for this particular
entry. Keys are one of ``pypi``, ``readthedocs`` or ``environment``.
Values correspond to specific names used for the lookup of the
information on those sources.
Attributes:
_data: Internal dictionary containing the mapping between package names
the user can refer to, versions and eventual sources of such
information.
"""
_data: dict[str, PackageDictionaryType]
def __init__(self) -> None:
self.reset()
[docs]
def load(self, path: pathlib.Path) -> None:
"""Loads and replaces contents with those from the file."""
with path.open("rt") as f:
logger.debug(f"Loading package catalog from {str(path)}...")
self._data = json.load(f)
logger.debug(f"Loaded {len(self)} entries from {str(path)}")
[docs]
def loads(self, contents: str) -> None:
"""Loads and replaces contents with those from the string."""
self._data = json.loads(contents)
logger.debug(f"Loaded {len(self)} entries from string")
[docs]
def dump(self, path: pathlib.Path) -> None:
"""Loads and replaces contents with those from the file."""
if path.exists():
backup = path.with_suffix(path.suffix + "~")
logger.debug(f"Backing up: {str(path)} -> {str(backup)}...")
shutil.copy(path, backup) # backup
with path.open("wt") as f:
logger.debug(
f"Saving package catalog with {len(self)} entries at {str(path)}..."
)
json.dump(self._data, f, indent=2)
f.write("\n") # avoids pre-commit/self-update conflicting changes
[docs]
def dumps(self) -> str:
"""Loads and replaces contents with those from the string."""
return json.dumps(self._data, indent=2)
[docs]
def reset(self) -> None:
"""Full resets internal catalog."""
self._data = {}
# mutable mapping operations, so this looks like a dictionary
def __getitem__(self, key: str) -> PackageDictionaryType:
return self._data[key]
def __setitem__(self, key: str, value: PackageDictionaryType) -> None:
self._data[key] = value
def __delitem__(self, key: str) -> None:
del self._data[key]
def __len__(self) -> int:
return len(self._data)
def __iter__(self) -> typing.Iterator[str]:
return iter(self._data)
def __repr__(self) -> str:
return repr(self._data)
def _ensure_defaults(self, pkg: str) -> None:
"""Ensures a standardised setup for a package entry."""
self.setdefault(pkg, {"versions": {}, "sources": {}})
self[pkg].setdefault("versions", {})
self[pkg].setdefault("sources", {})
[docs]
def update_versions_from_environment(self, pkg: str, name: str | None) -> bool:
"""Replaces package documentation URLs using information from current
Python environment.
Arguments:
pkg: Name of the package as one would find in pypi.org. This
name can be different then that of the Python package
itself.
name: This is the name of the package as installed on the current
environment. Sometimes, this name can be different then that of
the Python package itself. If this value is set to ``None``,
then we just use ``pkg`` as the name to lookup.
Returns:
``True``, if the update was successful (found versions), or
``False``, otherwise.
"""
self._ensure_defaults(pkg)
name = name or pkg
logger.debug(f"{pkg}: checking current Python environment for {name}...")
versions = docurls_from_environment(name)
logger.debug(
f"{pkg}: Found {len(versions)} doc URL(s) at current Python environment"
)
if versions:
self[pkg]["versions"].update(versions)
self[pkg]["versions"] = _reorder_versions(self[pkg]["versions"])
self[pkg]["sources"]["environment"] = name
return len(versions) > 0
[docs]
def update_versions_from_rtd(self, pkg: str, name: str | None) -> bool:
"""Replaces package documentation URLs using information from
readthedocs.org.
Arguments:
pkg: Name of the Python package to update versions for.
name: This is the name of the package on readthedocs.org. Often,
this name is different then that of the Python package itself.
If this value is set to ``None``, then we just use ``pkg`` as
the name to lookup.
Returns:
The dictionary of values for the current package, as obtained from
readthedocs.org, and potentially merged with the existing one.
"""
self._ensure_defaults(pkg)
name = name or pkg
logger.debug(f"{pkg}: checking readthedocs.org for {name}...")
versions = docurls_from_rtd(name)
logger.debug(f"{pkg}: Found {len(versions)} doc URL(s) at readthedocs.org")
if versions:
self[pkg]["versions"].update(versions)
self[pkg]["versions"] = _reorder_versions(self[pkg]["versions"])
self[pkg]["sources"]["readthedocs"] = name
return len(versions) > 0
[docs]
def update_versions_from_pypi(
self, pkg: str, name: str | None, max_entries: int
) -> bool:
"""Replaces package documentation URLs using information from pypi.org.
Arguments:
pkg: Name of the package as one would find in pypi.org. This
name can be different then that of the Python package
itself.
name: This is the name of the package on pypi.org. Sometimes, this
name can be different then that of the Python package itself.
If this value is set to ``None``, then we just use ``pkg`` as
the name to lookup.
max_entries: The maximum number of entries to lookup in PyPI. A
value of zero will download only the main package information
and will hit PyPI only once. A value bigger than zero will
download at most the information from the last ``max_entries``
releases. Finally, a negative value will imply the download of
all available releases.
Returns:
The dictionary of values for the current package, as obtained from
pypi.org, and potentially merged with the existing one.
"""
self._ensure_defaults(pkg)
name = name or pkg
logger.debug(f"{pkg}: checking pypi.org for {name}...")
versions = docurls_from_pypi(name, max_entries)
logger.debug(f"{pkg}: Found {len(versions)} doc URL(s) at pypi.org")
if versions:
self[pkg]["versions"].update(versions)
self[pkg]["versions"] = _reorder_versions(self[pkg]["versions"])
self[pkg]["sources"]["pypi"] = name
return len(versions) > 0
[docs]
def update_versions(
self,
pkgs: typing.Iterable[str],
order: typing.Iterable[str] = ["environment", "readthedocs", "pypi"],
names: dict[str, dict[str, str]] = {},
pypi_max_entries: int = 0,
keep_going: bool = False,
) -> None:
"""Updates versions for a list of packages in this catalog.
This method will add a list of packages defined by ``pkgs`` (list of
names) into its own catalog. The order of look-ups by default is set
by the ``order``, and it is the following:
1. Current Python environment (``environment``)
2. readthedocs.org (``readthedocs``)
3. PyPI (``pypi``)
Arguments:
pkgs: List of packages that will have their versions updated
order: A list, containing the order in which lookup will happen.
There are only 3 possible keys that can be used here:
``environment``, which stands for finding package metadata from
the currently installed Python environment, ``readthedocs``,
which will trigger readthedocs.org lookups, and ``pypi``, which
will trigger pypi.org lookups from uploaded packages.
names: A dictionary, that eventually maps source names (as in
``order``) to another dictionary that maps package names to to
their supposed names on readthedocs.org, pypi.org or the current
environment. If keys for various packages are not available, then
their package names are used. If the keys exist, but are set
to ``None``, then lookup for that particular source is skipped.
pypi_max_entries: The maximum number of entries to lookup in PyPI.
A value of zero will download only the main package information
and will hit PyPI only once. A value bigger than zero will
download at most the information from the last ``max_entries``
releases. Finally, a negative value will imply the download of
all available releases.
keep_going: By default, the method stops adding a package when a
hit is found (in either of these sources of information). If
the flag ``keep_going`` is set to ``True`` (defaults to
``False``), then it merges information from all sources. Note
that some of this information may be repetitive.
"""
for pkg in pkgs:
for action in order:
if action == "environment":
name = names.get(action, {}).get(pkg, pkg)
if name is not None:
ok = self.update_versions_from_environment(pkg, name)
if ok and not keep_going:
break
elif action == "readthedocs":
name = names.get(action, {}).get(pkg, pkg)
if name is not None:
ok = self.update_versions_from_rtd(pkg, name)
if ok and not keep_going:
break
elif action == "pypi":
name = names.get(action, {}).get(pkg, pkg)
if name is not None:
ok = self.update_versions_from_pypi(pkg, name, pypi_max_entries)
if ok and not keep_going:
break
else:
raise RuntimeError(f"Unrecognized source: {action}")
[docs]
def self_update(self) -> None:
"""Runs a self-update procedure, by re-looking up known sources."""
# organises the names as expected by update_versions()
names: dict[str, dict[str, str]] = dict(environment={}, readthedocs={}, pypi={})
for pkg, info in self.items():
for src in ("environment", "readthedocs", "pypi"):
names[src][pkg] = info["sources"].get(src)
self.update_versions(pkgs=self.keys(), names=names)
def _string2version(v: str) -> packaging.version.Version | None:
"""Converts a string into a version number.
This method covers various specific use-cases:
* ``1.2.3`` -> specific version
* ``1.2.x``, ``1.2`` -> anything in the ``[1.2.0, 1.3.0)`` range
* ``1.x.x``, ``1`` -> anything in the ``[1.0.0, 2.0.0)`` range
* anything else: discarded
Arguments:
v: a string containing the version number to be parsed, like the ones
in the catalog
Returns:
Either ``None``, or the version object with the parsed version.
"""
v = v.replace(".x", "")
try:
return packaging.version.Version(v)
except packaging.version.InvalidVersion:
return None
def _prepare_versions(versions: dict[str, str]) -> dict[str, str]:
"""Prepares a dictionary of versions for structured lookups.
This procedure:
1. Ensures there is one ``latest`` and ``stable`` entries in the input
dictionary
2. Augment the version dictionary with PEP-440 version numbers (e.g.
annotates ``v2.2.0`` -> ``2.2.0``, or ``1.x`` -> ``1``)
Arguments:
versions: A dictionary that maps release version (and aliases such as
``stable`` or ``latest`` to URLs that contain Sphinx-generated
documentation.
Returns:
A dictionary with keys that correspond to parsed versions and aliases.
"""
if not versions:
return versions
# see what each valid number means
version_map = {_string2version(k): k for k in versions.keys()}
sorted_versions = sorted([k for k in version_map.keys() if k is not None])
retval: dict[str, str] = {}
if sorted_versions:
# there is at least 1 (valid) version number
latest = sorted_versions[-1]
retval["latest"] = versions.get("latest", versions[version_map[latest]])
stable_versions = [
k for k in sorted_versions if not (k.is_prerelease or k.is_devrelease)
]
if stable_versions:
stable = stable_versions[-1]
else:
stable = latest
retval["stable"] = versions.get("stable", versions[version_map[stable]])
# fill-in the remainder of the versions, leave latest on top
for k in reversed(sorted_versions):
retval[version_map[k]] = versions[version_map[k]]
if ".x" in version_map[k]:
# copy to a shortened version number as well
retval[version_map[k].replace(".x", "")] = versions[version_map[k]]
elif k.public != version_map[k]:
# copy a standardised version number as well
retval[k.public] = versions[version_map[k]]
else:
# there is either nothing, or just aliases such as stable/latest
retval["latest"] = (
versions.get("latest")
or versions.get("stable")
or versions.get("master")
or versions.get("main")
or ""
)
retval["stable"] = (
versions.get("stable")
or versions.get("latest")
or versions.get("master")
or versions.get("main")
or ""
)
return retval
[docs]
class LookupCatalog:
"""A catalog that guarantees standardised version lookups.
Arguments:
catalog: The catalog to use as base for the lookup.
"""
def __init__(self, catalog: Catalog):
self._catalog = catalog
self.reset()
[docs]
def reset(self):
"""Internally creates all possible aliases for package names and
versions.
This method will expand the catalog package names and version
numbers so that the user can refer to these using environment,
readthedocs.org or pypi.org names for packages, and PEP-440
compatible strings for version names during the lookup.
The catalog associated to this lookup is not modified in this
process. All augmentations are built-into the object instance.
"""
self._version_map: dict[str, dict[str, str]] = {}
self._package_map: dict[str, str] = {}
for pkg in self._catalog.keys():
self._version_map[pkg] = _prepare_versions(self._catalog[pkg]["versions"])
# translations from Python, rtd.org or pypi.org names
self._package_map[pkg] = pkg
self._package_map.update(
{v: pkg for v in self._catalog[pkg]["sources"].values()}
)
[docs]
def get(self, pkg: str, version: str | None, default: typing.Any = None):
"""Accesses one single ``pkg/version`` documentation URL.
Arguments:
pkg: The package name, as available on the catalog or through one
of its environment, readthedocs.org or pypi.org names.
version: The version of the package to search for. This must be
either an identifier from readthedocs.org or pypi.org, or a valid
PEP-440 version number as a string.
default: The default value to return in case we do not find a
match.
Returns:
If a match is found, returns the URL for the documentation.
Otherwise, returns the ``default`` value.
"""
if pkg not in self._package_map:
return default
if version not in self._version_map[pkg]:
return default
return self._version_map[self._package_map[pkg]][version]