Source code for jangle.readers

from __future__ import annotations

import csv
import io
import os
import warnings
from typing import Generator, Iterable, Optional, TypeVar
from zipfile import ZipFile

import requests
from requests.compat import urljoin
from requests.models import ITER_CHUNK_SIZE

# Type variable for the caller-supplied fallback of ``Record.get_one``.
T = TypeVar("T")

# Base URL that registry file names are joined onto by ``IANARegistryReader``.
IANA_ASSIGNMENTS_URL = "https://www.iana.org/assignments/"
# Base URL that ``<table>.tab`` names are joined onto by ``SilTableReader``.
SIL_ISO_639_DOWNLOADS_URL = (
    "https://iso639-3.sil.org"
    "/sites/iso639-3/files/downloads/"
)
# Date stamp embedded in the SIL zip folder name and its member file names.
SIL_ISO_639_LATEST = "20220311"
# Top-level folder inside the SIL code-tables zip archive.
SIL_ISO_639_ZIPFILE = "iso-639-3_Code_Tables_" + SIL_ISO_639_LATEST


class Record(dict[str, list[str]]):
    """Used for working with record-jar records.

    Maps each field name to the list of values added for that field,
    in insertion order.
    """

    def add(self, key: str, val: str) -> None:
        """Append `val` to the field `key`, creating the field if needed."""
        self.setdefault(key, []).append(val)

    def one(self, key: str) -> str:
        """Return a single value from a field.

        Raises
        ------
        ValueError
            If the field has multiple values.
        KeyError
            If the field is missing or has no values.
        """
        vals = self[key]  # plain dict lookup: raises KeyError when absent
        if len(vals) > 1:
            raise ValueError(f"key '{key}' has multiple values {vals}")
        if not vals:
            raise KeyError(f"key '{key}' has an empty list of values")
        return vals[0]

    def get_one(self, key: str, default: T | None = None) -> str | T | None:
        """Return a single value from a field, or `default`.

        `default` is returned when the field is missing or has an empty
        list of values.

        Raises
        ------
        ValueError
            If the field has multiple values.
        """
        try:
            return self.one(key)
        except KeyError:
            return default


def parse_record_jar(
    lines: Iterable[str], indent: str = "\t", multiline_separator: str = "\r\n"
) -> Generator[Record, None, None]:
    """Yields records from a set of lines.
    See https://datatracker.ietf.org/doc/pdf/draft-phillips-record-jar-02.

    Blank lines are skipped. A line that starts with `indent` continues the
    most recent field value of the current record and is appended to it with
    `multiline_separator`; a ``%%`` line ends the current record. Any other
    line containing ``:`` starts a new ``name: value`` field.

    The record being built when the input ends is always yielded, even if
    it is empty.
    """
    record = Record()
    # Stripped name of the last field added to the *current* record, i.e.
    # the field a continuation line extends. None until a field is seen.
    key = None
    for line in lines:
        line_text = line.strip()
        if not line_text:
            continue
        if line.startswith(indent):
            if key is None:
                # Continuation with nothing to continue: ignore it.
                continue
            record[key][-1] += multiline_separator + line_text
        elif line_text == "%%":
            yield record
            record = Record()
            # A continuation must not reach back into the previous record;
            # the fresh record has no such field and would raise KeyError.
            key = None
        elif ":" in line_text:
            name, val = line_text.split(":", 1)
            # Keep the same (stripped) key the record is stored under so
            # that later continuation lines can find it.
            key = name.strip()
            record.add(key, val.strip())
    yield record


class SilTableReader:
    """Uses `csv.DictReader` to read tab-delimited data
    from https://iso639-3.sil.org/sites/iso639-3/files/downloads/,
    or a ZipFile to minimize requests.

    More information is available at
    https://iso639-3.sil.org/code_tables/download_tables.

    The reader is single-pass: rows are streamed from the underlying zip
    member or HTTP response, so iterate it before calling `close`.
    It can be used as a context manager, which closes it on exit.
    """

    chunk_size = ITER_CHUNK_SIZE

    def __init__(self, fn: str, zf: Optional[ZipFile] = None) -> None:
        """Open table `fn` from `zf` if given, otherwise download it.

        If the table cannot be opened from `zf`, a warning is issued and
        the table is downloaded instead.

        Raises
        ------
        requests.HTTPError
            If the download responds with an error status.
        """
        self._f: Optional[io.TextIOWrapper] = None
        self._r = None
        if zf is not None:
            # Zip member names always use "/" separators, whatever the
            # host platform's path separator is.
            member = f"{SIL_ISO_639_ZIPFILE}/{fn}_{SIL_ISO_639_LATEST}.tab"
            try:
                # Decode as UTF-8 to match the HTTP path below; newline=""
                # lets the csv module do its own line-ending handling.
                self._f = io.TextIOWrapper(
                    zf.open(member, "r"), encoding="utf-8", newline=""
                )
            except Exception as exc:  # TODO
                # Deliberate best effort: fall back to downloading.
                warnings.warn(str(exc))
        if self._f is None:
            response = requests.get(
                urljoin(SIL_ISO_639_DOWNLOADS_URL, fn + ".tab"), stream=True
            )
            try:
                response.raise_for_status()
            except Exception:
                # Do not leak the streamed connection on an error status.
                response.close()
                raise
            response.encoding = "utf-8"
            self._r = response

    def __iter__(self) -> csv.DictReader:
        """Return a `csv.DictReader` (excel-tab dialect) over the table."""
        if self._f is not None:
            it = self._f
        else:
            it = self._r.iter_lines(self.chunk_size, decode_unicode=True)
        return csv.DictReader(it, dialect="excel-tab")

    def close(self) -> None:
        """Close the zip member or HTTP response held by this reader."""
        if self._f is not None:
            self._f.close()
        if self._r is not None:
            self._r.close()

    def __enter__(self) -> SilTableReader:
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()


class IANARegistryReader:
    """Provides a file date and an iterable of records from an IANA registry.

    The registry is streamed over HTTP, so `records` is single-pass and
    must be consumed before `close` is called. The reader can be used as
    a context manager, which closes the response on exit.
    """

    chunk_size = ITER_CHUNK_SIZE
    records: Generator[Record, None, None]
    file_date: str

    def __init__(self, fn: str) -> None:
        """Request registry `fn` and read its leading File-Date record.

        Raises
        ------
        requests.HTTPError
            If the registry request responds with an error status.
        KeyError
            If the first record has no ``File-Date`` field.
        """
        self._response = requests.get(
            urljoin(IANA_ASSIGNMENTS_URL, f"{fn}/{fn}"),
            stream=True,
        )
        try:
            self._response.raise_for_status()
            # Decode explicitly instead of relying on header guessing,
            # matching SilTableReader. RFC 5646 specifies the registry
            # file as UTF-8.
            self._response.encoding = "utf-8"
            self.records = parse_record_jar(
                self._response.iter_lines(
                    self.chunk_size, decode_unicode=True
                ),
                indent=" " * 2,
                multiline_separator=" ",
            )
            # The first record is consumed here, so `records` starts at
            # the second record of the file.
            self.file_date = next(self.records).one("File-Date")
        except Exception:
            # Do not leak the streamed connection when setup fails.
            self._response.close()
            raise

    def close(self) -> None:
        """Close the underlying HTTP response."""
        self._response.close()

    def __enter__(self) -> IANARegistryReader:
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()


class IANASubtagRegistryReader(IANARegistryReader):
    """Reads the IANA language subtag registry.

    Source:
    https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
    """

    def __init__(self) -> None:
        super().__init__("language-subtag-registry")


class IANAExtensionsRegistryReader(IANARegistryReader):
    """Reads the IANA registry named ``language-extensions-registry``."""

    def __init__(self) -> None:
        # NOTE(review): IANA appears to publish this registry under
        # "language-tag-extensions-registry"; confirm this name resolves.
        super().__init__("language-extensions-registry")
Source code for jangle.readers

jangle

Navigation

Related Topics