# Source code for jangle.readers
from __future__ import annotations
import csv
import io
import os
import warnings
from typing import Generator, Iterable, Optional, TypeVar
from zipfile import ZipFile
import requests
from requests.compat import urljoin
from requests.models import ITER_CHUNK_SIZE
# Generic placeholder for the caller-supplied default in `Record.get_one`.
T = TypeVar("T")

# Base URL under which the IANA registry files are fetched.
IANA_ASSIGNMENTS_URL = "https://www.iana.org/assignments/"

# Base URL of the individual SIL ISO 639-3 ".tab" downloads.
SIL_ISO_639_DOWNLOADS_URL = (
    "https://iso639-3.sil.org/sites/iso639-3/files/downloads/"
)

# Date stamp embedded in the code-table archive folder and member names.
SIL_ISO_639_LATEST = "20220311"

# Top-level folder name inside the SIL code-table zip archive.
SIL_ISO_639_ZIPFILE = "iso-639-3_Code_Tables_" + SIL_ISO_639_LATEST
class Record(dict[str, list[str]]):
    """A record-jar record: each field name maps to a list of its values."""

    def add(self, key: str, val: str) -> None:
        """Append `val` to the field `key`, creating the field if absent."""
        self.setdefault(key, []).append(val)

    def one(self, key: str) -> str:
        """Return the only value stored under `key`.

        Raises
        ------
        ValueError
            If the field holds more than one value.
        KeyError
            If the field is missing or its value list is empty.
        """
        values = self[key]
        count = len(values)
        if count == 1:
            return values[0]
        if count > 1:
            raise ValueError(f"key '{key}' has multiple values {values}")
        raise KeyError(f"key '{key}' has an empty list of values")

    def get_one(self, key: str, default: T = None) -> str | T:
        """Return the only value stored under `key`, falling back to `default`.

        `default` is returned when the field is missing or empty.

        Raises
        ------
        ValueError
            If the field holds more than one value.
        """
        try:
            value = self.one(key)
        except KeyError:
            return default
        return value
def parse_record_jar(
    lines: Iterable[str], indent: str = "\t", multiline_separator: str = "\r\n"
) -> Generator[Record, None, None]:
    """Yield records parsed from record-jar formatted lines.

    See https://datatracker.ietf.org/doc/pdf/draft-phillips-record-jar-02.

    Parameters
    ----------
    lines
        The lines to parse; surrounding whitespace on each line is ignored.
    indent
        Prefix that marks a line as a continuation of the previous field.
    multiline_separator
        Text inserted between a field value and each of its continuations.

    Yields
    ------
    Record
        One record per ``%%``-separated section. The record in progress when
        the input ends is always yielded, even if it is empty.
    """
    record = Record()
    # Name of the field that a continuation line extends; None while the
    # current record has no field yet.
    key = None
    for line in lines:
        line_text = line.strip()
        if not line_text:
            continue
        if line.startswith(indent):
            if key is None:
                # Continuation with nothing to continue: ignore it.
                continue
            record[key][-1] += multiline_separator + line_text
        elif line_text == "%%":
            yield record
            record = Record()
            # A continuation must never attach to a field of the previous
            # record (the new record does not contain that key).
            key = None
        elif ":" in line_text:
            name, _, val = line_text.partition(":")
            # Remember the stripped name: it is the key actually stored, so
            # "Name : value" followed by a continuation line keeps working.
            key = name.strip()
            record.add(key, val.strip())
    yield record
class SilTableReader:
    """Uses `csv.DictReader` to read tab-delimited data
    from https://iso639-3.sil.org/sites/iso639-3/files/downloads/,
    or a ZipFile to minimize requests.

    More information is available at
    https://iso639-3.sil.org/code_tables/download_tables.

    The reader is a context manager and is meant to be iterated once:
    both the zip member and the streamed HTTP body are consumed by the
    first iteration.
    """

    chunk_size = ITER_CHUNK_SIZE

    def __init__(self, fn: str, zf: Optional[ZipFile] = None) -> None:
        """Open table `fn` from `zf` if possible, else download it.

        Parameters
        ----------
        fn
            Table name without date stamp or ".tab" extension.
        zf
            Optional archive to read the table from. If the member cannot
            be opened, a warning is issued and the table is downloaded.

        Raises
        ------
        requests.HTTPError
            If the download is attempted and returns an error status.
        """
        self._f = None
        self._r = None
        if zf is not None:
            # Zip member names always use "/" as separator, whatever the
            # platform, so the path must not be built with os.path.join.
            member = f"{SIL_ISO_639_ZIPFILE}/{fn}_{SIL_ISO_639_LATEST}.tab"
            try:
                self._f = zf.open(member, "r")
            except Exception as exc:  # best effort: fall back to HTTP
                warnings.warn(str(exc))
        if self._f is None:
            response = requests.get(
                urljoin(SIL_ISO_639_DOWNLOADS_URL, fn + ".tab"), stream=True
            )
            try:
                response.raise_for_status()
            except Exception:
                # Do not leak the streamed connection on an error status.
                response.close()
                raise
            response.encoding = "utf-8"
            self._r = response

    def __iter__(self) -> csv.DictReader:
        """Return a `csv.DictReader` over the rows of the table."""
        if self._f is not None:
            # Decode as UTF-8 explicitly (same as the HTTP path) instead of
            # relying on the locale default; newline="" as the csv module
            # recommends for its input.
            wrapper = io.TextIOWrapper(self._f, encoding="utf-8", newline="")
            try:
                it = wrapper.readlines()
            finally:
                # Detach so that garbage-collecting the wrapper does not
                # close the zip member; closing it is the job of close().
                wrapper.detach()
        else:
            it = self._r.iter_lines(self.chunk_size, decode_unicode=True)
        return csv.DictReader(it, dialect="excel-tab")

    def close(self) -> None:
        """Close the zip member or the HTTP response, whichever is open."""
        if self._f is not None:
            self._f.close()
        elif self._r is not None:
            self._r.close()

    def __enter__(self) -> SilTableReader:
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()
class IANARegistryReader:
    """Provides a file date and an iterable of records from an IANA registry.

    The registry is streamed over HTTP, so `records` is lazy and can be
    consumed only once. Call `close()` (or use the reader as a context
    manager) to release the connection when done.
    """

    chunk_size = ITER_CHUNK_SIZE
    # Remaining records of the registry; the leading File-Date record has
    # already been consumed by the constructor.
    records: Generator[Record, None, None]
    # Value of the "File-Date" field of the registry's first record.
    file_date: str

    def __init__(self, fn: str) -> None:
        """Start streaming the registry named `fn`.

        The file is fetched from ``IANA_ASSIGNMENTS_URL`` + ``fn/fn``.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status.
        KeyError
            If the first record has no "File-Date" field.
        """
        self._response = requests.get(
            urljoin(IANA_ASSIGNMENTS_URL, "/".join([fn, fn])),
            stream=True,
        )
        try:
            self._response.raise_for_status()
            # Force UTF-8, as SilTableReader does: without a known encoding
            # iter_lines(decode_unicode=True) would not yield decoded text.
            self._response.encoding = "utf-8"
            self.records = parse_record_jar(
                self._response.iter_lines(
                    self.chunk_size, decode_unicode=True
                ),
                indent=" " * 2,
                multiline_separator=" ",
            )
            self.file_date = next(self.records).one("File-Date")
        except BaseException:
            # Do not leak the streamed connection if construction fails.
            self._response.close()
            raise

    def close(self) -> None:
        """Release the underlying HTTP response."""
        self._response.close()

    def __enter__(self) -> IANARegistryReader:
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()
class IANASubtagRegistryReader(IANARegistryReader):
    """Reader for the IANA language subtag registry.

    Source:
    https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry.
    """

    def __init__(self) -> None:
        registry_name = "language-subtag-registry"
        super().__init__(registry_name)
class IANAExtensionsRegistryReader(IANARegistryReader):
    """Reader for the IANA language extensions registry.

    Source:
    https://www.iana.org/assignments/language-extensions-registry/language-extensions-registry.
    """

    def __init__(self) -> None:
        registry_name = "language-extensions-registry"
        super().__init__(registry_name)