"""
THIS LOG SHIPPER MUST ONLY SHIP LOGS.

It is not allowed to depend on any libraries other than python standard libraries, including our own
determined library.  The reason is that python virtual env errors in user containers are common
scenarios for users to encounter, and we must be able to still ship logs in those scenarios.

The only thing that is allowed to break the log shipper is a misconfigured cluster (if the log
shipper isn't able to connect to the master) or if python isn't installed.

But if you are thinking of making this log shipper do anything other than ship logs, or if you are
thinking of adding any dependencies not inside the standard library, stop.  Let the log shipper
just ship logs; it's too important of a job to mix it with anything else.

---

ship_logs.py: a suitable container entrypoint that ships logs from a child process to the master.

usage: ship_logs.py CMD ARGS...

ship_logs.py will read environment variables set by the master to obtain its configuration.  It
isn't intended to be useful in any non-managed environments.
"""

import datetime
import io
import json
import logging
import os
import queue
import re
import signal
import socket
import ssl
import subprocess
import sys
import threading
import time
import traceback
import urllib.error
import urllib.request
from typing import Any, Dict, Iterator, List, NamedTuple, Optional, cast

# Duplicated from determined/__init__.py.  It's nice to keep them in sync.
LOG_FORMAT = "%(levelname)s: [%(process)s] %(name)s: %(message)s"


# Example log message given below.
# 2022-05-12 16:32:48,757:gc_checkpoints: [rank=0] INFO: Determined checkpoint GC, ...
# Below regex is used to extract the rank field from the log message.
# Excluding empty spaces this regex matches rank in the above example as [rank=0]
# Using the DOTALL flag lets the trailing `.*` of the `log` group also capture the newline at the
# end of the line, so the remainder of the line is preserved verbatim.
# Note: Collector applies this pattern with .match(), so it only matches at the start of a line.
rank = re.compile(
    r"(?P<space1> ?)\[rank=(?P<rank_id>([0-9]+))\](?P<space2> ?)(?P<log>.*)", flags=re.DOTALL
)
# Below regex is used to extract the message severity from the log message.
# Excluding empty spaces and delimiter(:) this regex matches message severity level in the above
# example as INFO.
# Using the DOTALL flag lets the trailing `.*` of the `log` group also capture the newline at the
# end of the line.
# Note: Collector applies this pattern with .match() to whatever remains after the rank prefix (if
# any) has been stripped.
level = re.compile(
    r"(?P<space1> ?)(?P<level>(DEBUG|INFO|WARNING|ERROR|CRITICAL)):(?P<space2> ?)(?P<log>.*)",
    flags=re.DOTALL,
)
# A line ends at either a carriage return or a newline (bytes pattern; applied to raw pipe output).
lineend = re.compile(rb"[\r\n]")


# Interval, in seconds, at which the Shipper forces a flush of whatever logs it has collected, even
# if the batch is not full.  This bounds how often API calls are made when logs trickle in.
SHIPPER_FLUSH_INTERVAL = 1  # How often to make API calls

# Full jitter time on encountering an API exception.
# NOTE(review): this constant is not referenced anywhere else in this file as shown -- confirm
# whether it is still needed.
SHIPPER_FAILURE_BACKOFF_SECONDS = 1

# Max number of log entries in a batch before forcing a flush.
LOG_BATCH_MAX_SIZE = 1000

# Max size of the shipping queue before we start to apply backpressure by blocking sends. We would
# only hit this if we got underwater by three full batches while trying to ship a batch.
# NOTE(review): this constant is not referenced anywhere else in this file as shown (main() builds
# its queues without a maxsize) -- confirm whether backpressure is actually intended.
SHIP_QUEUE_MAX_SIZE = 3 * LOG_BATCH_MAX_SIZE


class DoneMsg(NamedTuple):
    """
    DoneMsg is what each thread of execution puts on the doneq when it finishes.
    """

    # Which thread finished: "stdout", "stderr", "shipper", or "waiter".
    who: str
    # The exception that terminated the thread, or None if it finished cleanly.
    error: Optional[Exception] = None
    # The child's exit code; only the "waiter" thread sets this.
    exit_code: Optional[int] = None


# Duplicated in wrap_rank.py.  If you find a bug here, fix it there too.
def read_newlines_or_carriage_returns(fd: io.RawIOBase) -> Iterator[str]:
    r"""
    Read lines, delineated by either '\n' or '\r'.

    Unlike the default io.BufferedReader used in subprocess.Popen(bufsize=-1), we read until we
    encounter either '\n' or '\r', and treat that as one line.

    Specifically, io.BufferedReader doesn't handle tqdm progress bar outputs very well; it treats
    all of the '\r' outputs as one enormous line.

    Bytes that are not valid UTF-8 (including a multi-byte character that happens to be split by
    the length limit below) are decoded to U+FFFD replacement characters rather than raising, so
    that arbitrary child output can never crash the log shipper.

    Args:
        fd: an unbuffered stdout or stderr from a subprocess.Popen.

    Yields:
        A series of str, one per line.  Each line always ends with a '\n'.  Each line will be
        broken to length io.DEFAULT_BUFFER_SIZE, even if the underlying io didn't have a linebreak.
    """
    # Ship lines of length of DEFAULT_BUFFER_SIZE, including the terminating newline.
    limit = io.DEFAULT_BUFFER_SIZE - 1
    # Number of bytes accumulated in chunks for the line currently being assembled.
    nread = 0
    chunks: List[bytes] = []

    def oneline() -> str:
        """Join and decode the pending chunks into one line, then reset the accumulator."""
        nonlocal nread
        nonlocal chunks
        # Never let undecodable user output raise; substitute replacement characters instead.
        out = b"".join(chunks).decode("utf8", errors="replace")
        chunks = []
        nread = 0
        return out

    while True:
        # Never read more than what is still allowed for the current line.
        buf = fd.read(limit - nread)
        if not buf:
            # EOF.
            break

        # Extract all the lines from this buffer.
        while buf:
            m = lineend.search(buf)
            if m is None:
                # No line break here; just append to chunks.
                chunks.append(buf)
                nread += len(buf)
                break

            # Line break found!
            start, end = m.span()
            chunks.append(buf[:start])
            # Even if we matched a '\r', emit a '\n'.
            chunks.append(b"\n")
            yield oneline()
            # keep checking the rest of buf
            buf = buf[end:]

        # Detect if we reached our buffer limit.
        if nread >= limit:
            # Pretend we got a line anyway.
            chunks.append(b"\n")
            yield oneline()

    # One last line, maybe.
    if chunks:
        chunks.append(b"\n")
        yield oneline()


class Collector(threading.Thread):
    """
    A thread that drains one of the child's output pipes (stdout or stderr), line by line.

    Every line is turned into a structured log dict and placed on the logq.  When the pipe is
    exhausted (or the thread fails), a DoneMsg is placed on the doneq, and a None end-of-stream
    marker is always placed on the logq.
    """

    def __init__(
        self,
        fd: io.RawIOBase,
        stdtype: str,
        emit_stdout_logs: bool,
        metadata: Dict[str, Any],
        logq: queue.Queue,
        doneq: queue.Queue,
        daemon: bool,
    ) -> None:
        super().__init__(daemon=daemon)
        self.fd = fd
        self.stdtype = stdtype
        self.logq = logq
        self.doneq = doneq
        # Keys in the caller's metadata are applied after (and so override) our "stdtype" entry.
        self.metadata = {"stdtype": self.stdtype, **metadata}

        # Optionally mirror every line onto our own matching stream.
        self.dup_io = None
        if emit_stdout_logs:
            self.dup_io = sys.stdout if stdtype == "stdout" else sys.stderr

    def run(self) -> None:
        failure: Optional[Exception] = None
        try:
            try:
                self._run()
            except Exception as e:
                failure = e
            # Report how we finished, whether cleanly (error=None) or not.
            self.doneq.put(DoneMsg(self.stdtype, error=failure))
        finally:
            # Whatever happened, tell the consumer of logq that this stream is finished.
            self.logq.put(None)

    def _run(self) -> None:
        for text in read_newlines_or_carriage_returns(self.fd):
            # Timestamp the line as close to its collection as possible.
            stamp = datetime.datetime.now(datetime.timezone.utc).isoformat()

            if self.dup_io:
                print(text, file=self.dup_io, flush=True, end="")

            record: Dict[str, Any] = {"timestamp": stamp}
            record.update(self.metadata)

            # Strip a leading "[rank=N]" prefix, remembering N.
            rank_match = rank.match(text)
            if rank_match is not None:
                try:
                    record["rank_id"] = int(rank_match.group("rank_id"))
                except ValueError:
                    pass
                else:
                    text = rank_match.group("log")

            # Strip a leading "LEVEL:" prefix, remembering the level.
            level_match = level.match(text)
            if level_match is not None:
                record["level"] = "LOG_LEVEL_" + level_match.group("level")
                text = level_match.group("log")

            record["log"] = text
            self.logq.put(record)


def override_verify_name(ctx: ssl.SSLContext, verify_name: str) -> ssl.SSLContext:
    """
    Wrap ctx in a proxy that always verifies the peer certificate against verify_name.

    The proxy forwards attribute reads to ctx (yielding None for attributes ctx does not have),
    and its wrap_socket() discards whatever server_hostname the caller passed, substituting
    verify_name instead.
    """

    class _NamePinningProxy:
        def __getattr__(self, name: str, default: Any = None) -> Any:
            # Only reached for attributes not found on the proxy itself.
            return getattr(ctx, name, default)

        def wrap_socket(self, *args, server_hostname=None, **kwargs) -> Any:
            # The caller's server_hostname is intentionally ignored.
            return ctx.wrap_socket(*args, server_hostname=verify_name, **kwargs)

    proxy = _NamePinningProxy()
    return cast(ssl.SSLContext, proxy)


class Shipper(threading.Thread):
    """
    Shipper reads structured logs from logq and ships them to the determined-master.

    It will send a message on doneq when it finishes.
    """

    def __init__(
        self,
        master_url: str,
        token: str,
        cert_name: str,
        cert_file: str,
        logq: queue.Queue,
        doneq: queue.Queue,
        daemon: bool,
    ) -> None:
        """
        Args:
            master_url: base url of the master; "https://" urls get a TLS context built here.
            token: sent to the master as a bearer token in the allocation-token header.
            cert_name: if non-empty, the name the master's certificate is verified against
                instead of the hostname in master_url.
            cert_file: path to a certificate to trust, the literal "noverify" (any case) to
                disable certificate verification, or "" to use only the default trust store.
            logq: queue of log dicts to ship; each Collector ends its stream with a None.
            doneq: queue on which a DoneMsg is put when this thread finishes.
            daemon: passed through to threading.Thread.
        """
        super().__init__(daemon=daemon)
        self.logq = logq
        self.doneq = doneq

        # TODO(rb): Switch to DET_USER_TOKEN when the user token passed into a container isn't
        # limited to expire in 7 days, and then set `Authorization: Bearer $token` here instead.
        self.headers = {"Grpc-Metadata-x-allocation-token": f"Bearer {token}"}

        self.master_url = master_url
        self.cert_name = cert_name
        self.noverify = cert_file.lower() == "noverify"
        self.cert_content: Optional[str] = None
        if cert_file and not self.noverify:
            try:
                with open(cert_file, "r") as f:
                    self.cert_content = f.read()
            except Exception:
                # Best effort: report the problem and carry on without the extra trusted cert.
                logging.error(
                    f"failed to read DET_MASTER_CERT_FILE ({cert_file})", exc_info=True
                )

        self.base_url = master_url.rstrip("/")
        self.logs_url = f"{self.base_url}/api/v1/task/logs"

        self.context: Optional[ssl.SSLContext] = None
        if master_url.startswith("https://"):
            # Create an SSLContext that trusts our DET_MASTER_CERT_FILE, and checks the hostname
            # against the DET_MASTER_CERT_NAME (which may differ from the hostname in the url).
            self.context = ssl.create_default_context()
            if self.noverify:
                # Don't check the master's certificate.
                # Presently the master never sets this value for DET_MASTER_CERT_FILE, but we keep
                # this check to be consistent with the CLI behavior.
                self.context.check_hostname = False
                self.context.verify_mode = ssl.CERT_NONE
            else:
                if self.cert_content:
                    # Explicitly trust the cert in cert_file.  We only get here if the file was
                    # actually read; a read failure was already logged above, and passing
                    # cadata=None would raise instead of letting us continue.
                    self.context.load_verify_locations(cadata=self.cert_content)
                if cert_name:
                    # Override hostname verification
                    self.context = override_verify_name(self.context, cert_name)

    def run(self) -> None:
        """Thread entry point: run the shipping loop, then report the outcome on doneq."""
        try:
            self._run()
        except Exception as e:
            self.doneq.put(DoneMsg("shipper", error=e))
        else:
            self.doneq.put(DoneMsg("shipper", error=None))

    def _run(self) -> None:
        """
        Batch logs from logq and ship each batch, until both Collectors have sent their None.

        Raises:
            RuntimeError: if a batch could not be shipped after all retries (see ship()).
        """
        # Count of end-of-stream markers (None) seen so far; we expect one per Collector.
        eofs = 0
        while eofs < 2:
            logs: List[Dict[str, Any]] = []
            deadline = time.time() + SHIPPER_FLUSH_INTERVAL
            # Pop logs until both collectors close, or we fill up a batch, or we hit the deadline.
            while eofs < 2 and len(logs) < LOG_BATCH_MAX_SIZE:
                now = time.time()
                timeout = deadline - now
                if timeout <= 0:
                    # We are already passed the deadline.
                    break

                try:
                    log = self.logq.get(timeout=timeout)
                except queue.Empty:
                    # We hit the timeout.
                    break

                if log is None:
                    eofs += 1
                    continue

                logs.append(log)

            if not logs:
                continue

            data = json.dumps({"logs": logs}).encode("utf8")

            # Try to ship for about ten minutes.
            backoffs = [0, 1, 5, 10, 15, 15, 15, 15, 15, 15, 15, 60, 60, 60, 60, 60, 60, 60, 60, 60]

            self.ship(data, backoffs)

    def ship(self, data: bytes, backoffs: List[int]) -> None:
        """
        POST an already-encoded batch of logs to the master, retrying on failure.

        Args:
            data: the utf8-encoded json request body.
            backoffs: seconds to sleep before each attempt; one attempt is made per entry.

        Raises:
            RuntimeError: if every attempt failed.  Individual failures are logged, not raised.
        """
        for delay in backoffs:
            time.sleep(delay)
            try:
                req = urllib.request.Request(self.logs_url, data, self.headers, method="POST")
                try:
                    with urllib.request.urlopen(req, context=self.context) as resp:
                        respbody = resp.read()
                        status = resp.getcode()
                except urllib.error.URLError as e:
                    # urllib stacktraces are awful, so see if we can interpret what happened.
                    # Note that we've already connected successfully to the master so failures here
                    # are likely related to master crashing or the network breaking or something to
                    # that effect.
                    if isinstance(e, urllib.error.HTTPError):
                        raise RuntimeError(
                            f"POST logs returned status code: {e.code} and reason: {e.reason}, "
                            "is the master healthy?"
                        ) from None
                    elif isinstance(e.reason, ConnectionRefusedError):
                        raise RuntimeError(
                            f"The connection to {self.master_url} was refused, is master down?"
                        ) from None
                    raise

                if status != 200:
                    raise RuntimeError(
                        f"ship logs failed: status code {status}, body:\n---\n"
                        + respbody.decode("utf8", errors="replace")
                    )

                # Shipped successfully
                return

            except Exception:
                # Log this attempt's failure and move on to the next backoff.
                logging.error("failed to ship logs to master", exc_info=True)

        raise RuntimeError("failed to connect to master for too long, giving up")

    def ship_special(self, msg: str, metadata: Dict[str, str], emit_stdout_logs: bool) -> None:
        """
        Ship a special message, probably from failing to start the child process.

        Each line of msg is shipped as its own error-level stderr log entry.

        Args:
            msg: the (possibly multi-line) message; a trailing newline is added if missing.
            metadata: extra fields merged into every entry (and overriding our own on conflict).
            emit_stdout_logs: if true, also print each line to our own stderr.

        Raises:
            RuntimeError: if the message could not be shipped (see ship()).
        """
        now = datetime.datetime.now(datetime.timezone.utc).isoformat()
        logs = []
        # Build a json log line out of each message line.
        if not msg.endswith("\n"):
            msg += "\n"
        for line in msg.splitlines(keepends=True):
            if emit_stdout_logs:
                print(line, end="", file=sys.stderr)
            logs.append(
                {
                    "timestamp": now,
                    "log": line,
                    # Use the same "LOG_LEVEL_<NAME>" form that the Collector emits.
                    "level": "LOG_LEVEL_ERROR",
                    "stdtype": "stderr",
                    **metadata,
                }
            )

        data = json.dumps({"logs": logs}).encode("utf8")

        # Try to ship for about 30 seconds.
        backoffs = [0, 1, 5, 10, 15]
        self.ship(data, backoffs)

    def assert_master_is_reachable(self) -> None:
        """
        Before we start actually shipping logs, try to confirm that we can reach the master at all.

        If we can't, log an overly-detailed explanation to help sysadmins debug their cluster.

        Unfortunately, if we can't reach the master, they'll have to use the /ship_logs escape hatch
        to see this message, or be on slurm where we pull slurm's stderr into task logs
        automatically.

        Raises:
            RuntimeError: if none of the attempts reached the master.
        """
        backoffs = [0, 1, 5, 10, 15]
        for delay in backoffs:
            time.sleep(delay)
            if self.try_reaching_master():
                return
        raise RuntimeError("failed to ever reach master")

    def try_reaching_master(self) -> bool:
        """
        Make one GET request to the master's /api/v1/me endpoint.

        Returns:
            True if the request succeeded.  Otherwise a detailed diagnostic (including TLS and
            proxy settings) is logged and False is returned; no exception escapes.
        """
        endpoint = "/api/v1/me"
        url = f"{self.base_url}{endpoint}"
        req = urllib.request.Request(url, headers=self.headers)
        try:
            with urllib.request.urlopen(req, context=self.context) as resp:
                _ = resp.read()
            return True
        except Exception as e:
            # urllib stacktraces are awful, so in cases where we can tell what happened, avoid
            # printing it at all.
            exc_info = True
            detail = (
                "This may be due to an address resolution problem, a certificate problem, a "
                "firewall problem, a proxy problem, or some other networking error."
            )
            if isinstance(e, urllib.error.HTTPError):
                detail = f"GET {url} returned status code: {e.code} and reason: {e.reason}."
                exc_info = False
            elif isinstance(e, urllib.error.URLError):
                if isinstance(e.reason, socket.gaierror):
                    detail = f"Name lookup for {self.master_url} failed: {e.reason}"
                    exc_info = False
                elif isinstance(e.reason, ConnectionRefusedError):
                    detail = f"The connection to {self.master_url} was refused."
                    exc_info = False
                elif isinstance(e.reason, ssl.SSLCertVerificationError):
                    detail = f"There was a tls verification error: {e.reason.verify_message}."
                    exc_info = False

            # Report whichever proxy variables apply to the scheme of the master url.
            if self.master_url.lower().startswith("https"):
                proxy_name_lower = "https_proxy"
                proxy_name_upper = "HTTPS_PROXY"
            else:
                proxy_name_lower = "http_proxy"
                proxy_name_upper = "HTTP_PROXY"

            proxy_value_lower = os.environ.get(proxy_name_lower)
            proxy_value_upper = os.environ.get(proxy_name_upper)

            no_proxy_value_lower = os.environ.get("no_proxy")
            no_proxy_value_upper = os.environ.get("NO_PROXY")

            logging.error(
                f"Unable to reach the master at DET_MASTER={self.master_url}.  {detail}\n"
                "Debug information:\n"
                f"    master_url: {self.master_url}\n"
                f"    endpoint: {url}\n"
                f"    tls_verify_name: {self.cert_name or None}\n"
                f"    tls_noverify: {self.noverify}\n"
                f"    tls_cert: {self.cert_content}\n"
                f"    {proxy_name_lower}: {proxy_value_lower}\n"
                f"    {proxy_name_upper}: {proxy_value_upper}\n"
                f"    no_proxy: {no_proxy_value_lower}\n"
                f"    NO_PROXY: {no_proxy_value_upper}\n",
                exc_info=exc_info,
            )
            return False


class Waiter(threading.Thread):
    """
    Waiter blocks in p.wait() and reports the child's exit code on the doneq; nothing more.
    """

    def __init__(self, p: subprocess.Popen, doneq: queue.Queue):
        super().__init__()
        self.doneq = doneq
        self.p = p

    def run(self) -> None:
        try:
            # Block until the child exits, then report how it exited.
            finished = DoneMsg("waiter", exit_code=self.p.wait(), error=None)
            self.doneq.put(finished)
        except Exception as e:
            failed = DoneMsg("waiter", exit_code=None, error=e)
            self.doneq.put(failed)


def main(
    master_url: str,
    cert_name: str,
    cert_file: str,
    metadata: Dict[str, str],
    token: str,
    emit_stdout_logs: bool,
    cmd: List[str],
    log_wait_time: int,
) -> int:
    """
    Run cmd as a child process and ship its stdout and stderr to the master.

    Args:
        master_url: base url of the master.
        cert_name: name to verify the master's certificate against, or "".
        cert_file: certificate file to trust, "noverify", or "".
        metadata: fields attached to every shipped log entry.
        token: token used to authenticate to the master.
        emit_stdout_logs: if true, also echo the child's output on our own stdout/stderr.
        cmd: the child command and its arguments.
        log_wait_time: seconds to keep trying to ship logs after the child has exited.

    Returns:
        The child's exit code, with deaths-by-signal converted to bash-style 128+signum; or
        127 / 126 / 80 if the child could not be started (not found / not permitted / other).

    Raises:
        RuntimeError: if the master is unreachable, or if one of our own threads died.
    """
    logq: queue.Queue = queue.Queue()
    doneq: queue.Queue = queue.Queue()

    waiter_started = False
    shipper_started = False

    # Normally we like structured concurrency; i.e. a function that owns a thread must not exit
    # until that thread has been properly cleaned up.  However, it is important that the log
    # shipper is not allowed to keep a task container alive too long after the child process has
    # exited.  We want to guarantee that we exit about DET_LOG_WAIT_TIME seconds after the child
    # process exits.
    #
    # However, interrupting a synchronous HTTP call from urllib is nearly impossible; even if you
    # were to select() until the underlying file descriptor had something to read before calling
    # Request.read(), there are many buffered readers in there and most likely multiple os.read()
    # calls would occur and you'd be blocking anyway.
    #
    # So as an easy workaround, we set daemon=True and just exit the process if it's not done on
    # time.
    shipper = Shipper(master_url, token, cert_name, cert_file, logq, doneq, daemon=True)
    shipper_timed_out = False

    shipper.assert_master_is_reachable()

    # Start the process or ship a special log message to the master why we couldn't.
    try:
        # Don't rely on Popen's standard line buffering; we want to do our own line buffering.
        p = subprocess.Popen(
            cmd, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0
        )
    except FileNotFoundError:
        shipper.ship_special(f"FileNotFoundError executing {cmd}", metadata, emit_stdout_logs)
        # 127 is the standard bash exit code for file-not-found.
        return 127
    except PermissionError:
        # Unable to read or to execute the command.
        shipper.ship_special(f"PermissionError executing {cmd}", metadata, emit_stdout_logs)
        # 126 is the standard bash exit code for permission failure.
        return 126
    except Exception:
        msg = f"unexpected failure executing {cmd}:\n" + traceback.format_exc()
        shipper.ship_special(msg, metadata, emit_stdout_logs)
        # 80 is the exit code we use to signal "ship_logs.py failed"
        return 80

    # Just for mypy.
    assert isinstance(p.stdout, io.RawIOBase) and isinstance(p.stderr, io.RawIOBase)

    try:
        waiter = Waiter(p, doneq)
        waiter.start()
        waiter_started = True

        # Set up signal forwarding.
        def signal_passthru(signum: Any, frame: Any) -> None:
            p.send_signal(signum)

        for sig in [
            signal.SIGINT,
            signal.SIGTERM,
            signal.SIGHUP,
            signal.SIGUSR1,
            signal.SIGUSR2,
            signal.SIGWINCH,
        ]:
            signal.signal(sig, signal_passthru)

        # Note: run Collectors with daemon=True, due to the Orphaned Grandchild problem (see below).
        stdout = Collector(p.stdout, "stdout", emit_stdout_logs, metadata, logq, doneq, daemon=True)
        stderr = Collector(p.stderr, "stderr", emit_stdout_logs, metadata, logq, doneq, daemon=True)

        stdout.start()

        stderr.start()

        shipper.start()
        shipper_started = True

        # Expect 4 messages on the doneq, one for each thread we started.
        #
        # Ideally, we expect all four threads (two Collectors, one Shipper, and one Waiter) to
        # finish nicely around the same time.  But there are two problems we must deal with:
        #
        #   - The Orphaned Grandchild Problem
        #
        #     If the user code starts and orphans a child process with access to our stdout and
        #     stderr, then our Collectors can potentially be held open an indefinite amount of time
        #     after the main subprocess exits (and the Waiter thread finishes).  The way docker
        #     normally treats this case is that when the main process exits, every process in the
        #     container namespace is killed immediately.  Our analog of that behavior is: after the
        #     main process exits, we do not care about any logs from any orphaned grandchildren.
        #
        #     Therefore, after the main process exits, we wait up to 1 second for the Collector
        #     threads to finish before we decide to ignore them completely.  The only reason we even
        #     wait 1 second is to be sure not to drop logs from a well-behaved main subprocess.
        #
        #   - The Unreachable Master Problem
        #
        #     It is possible the master is temporarily not receiving logs for some reason.  This
        #     would cause the Shipper thread to not finish promptly.  This is not the fault of user
        #     code, and we want to try hard to preserve all logs from user code, so we wait a
        #     total of DET_LOG_WAIT_TIME after the Waiter finishes for the shipper to keep trying.
        exit_code = None
        stdout_done = False
        stderr_done = False
        shipper_done = False
        collector_deadline: Optional[float] = None
        shipper_deadline: Optional[float] = None
        while exit_code is None or not stdout_done or not stderr_done or not shipper_done:
            # Wait for an event, possibly with a deadline (if the child process already exited).
            try:
                if exit_code is None:
                    timeout = None
                else:
                    # Pick the shortest deadline and calculate a timeout.
                    assert shipper_deadline
                    deadline = shipper_deadline
                    if collector_deadline is not None:
                        deadline = min(collector_deadline, shipper_deadline)
                    timeout = deadline - time.time()
                    if timeout <= 0:
                        raise queue.Empty()
                donemsg = doneq.get(timeout=timeout)
                assert isinstance(donemsg, DoneMsg)
            except queue.Empty:
                # We hit a deadline.
                now = time.time()
                if shipper_deadline is not None and shipper_deadline <= now:
                    # Abandon Ship(per)!
                    shipper_timed_out = True
                    logging.error(
                        f"waited {log_wait_time} seconds for shipper to finish after child exit; "
                        "giving up now"
                    )
                    break
                if collector_deadline is not None and collector_deadline <= now:
                    # Make the Shipper stop waiting on the Collectors.
                    stdout.logq.put(None)
                    stderr.logq.put(None)
                    # Also we won't wait for them either.
                    stdout_done = True
                    stderr_done = True
                    # Keep going without collector_deadline.
                    collector_deadline = None
                # No new DoneMsg arrived, so there is nothing to process below.  Always go back to
                # waiting: falling through would re-process the previous donemsg (and a stale
                # "waiter" message would push both deadlines out again).
                continue

            if donemsg.error is not None:
                # Something in our shipping machinery broke.
                raise RuntimeError(
                    f"failure in log shipper; {donemsg.who} thread died"
                ) from donemsg.error

            if donemsg.who == "stdout":
                stdout_done = True
            elif donemsg.who == "stderr":
                stderr_done = True
            elif donemsg.who == "shipper":
                shipper_done = True
            elif donemsg.who == "waiter":
                # After the main subprocess exits, the shipping code is on a deadline.
                #
                # Note: we could almost just return exit_code here and let the cleanup and shipping
                # timeout happen in the finally block, but the finally block can't easily detect
                # exceptions that arrive on the doneq, so we stay in this while loop until we're
                # confident that either all threads shut down without error, or at least that the
                # only threads left alive are the Collectors, in an Orphaned Grandchild case.
                collector_deadline = time.time() + 1
                shipper_deadline = time.time() + log_wait_time
                exit_code = donemsg.exit_code

        # Mypy doesn't know that we're guaranteed to have an exit_code by now.
        #
        # It's guaranteed because the only time deadlines are set is after the exit_code is set.
        assert exit_code is not None

        # Convert signal exits to standard bash-style exits.
        if exit_code < 0:
            return 128 - exit_code
        return exit_code

    finally:
        # If our logging infrastructure ever crashes, just give up on the child process.
        p.kill()
        if waiter_started:
            waiter.join()

        # Note: We never join on the Collector threads, because of the Orphaned Grandchild problem.

        # Even after there are no new logs to ship, the shipper could still be in a retry loop for a
        # long time, so we wait up to DET_LOG_WAIT_TIME seconds for it to exit, then we give up.
        if shipper_started and not shipper_timed_out:
            shipper.join(timeout=log_wait_time)
            if shipper.is_alive():
                # The timeout was reached.
                logging.error(
                    f"waited {log_wait_time} seconds for shipper to finish after crash; "
                    "giving up now"
                )


def configure_escape_hatch(dirpath: str) -> None:
    """
    Even if the log shipper goes belly-up, dump logs to a bind-mounted path.

    If the log shipper is failing in production, you obviously can't expect to find those logs in
    task logs, so this allows a user or support person to mount a directory into a container in
    order to find out why the log shipper is broken.

    Args:
        dirpath: an existing directory in which to write the escape-hatch files.
    """

    # Name the log file after the agent if we know it, else after the host.
    try:
        hostname = os.environ["DET_AGENT_ID"]
    except Exception:
        try:
            import socket

            hostname = socket.gethostname()
        except Exception:
            hostname = "unknown"
    # You can't run logging.basicConfig() twice so we manually add a second handler at the root
    # logging level.
    fh = logging.FileHandler(
        filename=os.path.join(dirpath, f"{hostname}-{time.time()}.log"),
        # Only create the file if we actually log to it (aka if there's an error).  That way if
        # there's lots of processes not failing, we don't create tons of empty files.
        delay=True,
    )
    fh.setFormatter(logging.Formatter(LOG_FORMAT))
    logging.getLogger().addHandler(fh)

    try:
        # Touch a single file to indicate that the escape hatch is working, so that in debugging
        # someone can distinguish "the escape hatch isn't working" from "ship_logs just isn't
        # hitting any errors".
        with open(os.path.join(dirpath, "ship-logs-ran"), "w"):
            pass
    except Exception:
        # Best effort only: the marker file is a debugging aid, never a reason to fail.
        pass


if __name__ == "__main__":
    # Script entry point: read our configuration from the environment, run the command given on
    # the command line under main(), and exit with its exit code (or 80 if we crash ourselves).
    try:
        logging.basicConfig(
            format=LOG_FORMAT,
            stream=sys.stderr,
        )

        # The escape hatch is only enabled if its directory exists (e.g. was bind-mounted in).
        ship_logs_path = os.environ.get("DET_SHIP_LOGS_PATH", "/ship_logs")
        if os.path.exists(ship_logs_path):
            configure_escape_hatch(ship_logs_path)

        master_url = os.environ["DET_MASTER"]
        cert_name = os.environ.get("DET_MASTER_CERT_NAME", "")
        cert_file = os.environ.get("DET_MASTER_CERT_FILE", "")
        # TODO(rb): fix DET_USER_TOKEN to support tokens with lifetimes tied to an allocation, and
        # use DET_USER_TOKEN here instead.
        token = os.environ["DET_SESSION_TOKEN"]
        raw_metadata = os.environ["DET_TASK_LOGGING_METADATA"]
        try:
            metadata = json.loads(raw_metadata)
            # Validate explicitly rather than with assert, which is stripped under `python -O`.
            if not isinstance(metadata, dict):
                raise TypeError("DET_TASK_LOGGING_METADATA must be a json object")
        except Exception:
            raise ValueError(f"invalid DET_TASK_LOGGING_METADATA: '{raw_metadata}'") from None

        metadata["container_id"] = os.environ.get("DET_CONTAINER_ID", "")
        metadata["agent_id"] = os.environ.get("DET_AGENT_ID", "")

        raw_log_wait_time = os.environ.get("DET_LOG_WAIT_TIME", "30")
        try:
            log_wait_time = int(raw_log_wait_time)
        except Exception:
            raise ValueError(f"invalid DET_LOG_WAIT_TIME: '{raw_log_wait_time}'") from None

        # Any non-empty value enables echoing the child's output on our own stdout/stderr.
        emit_stdout_logs = bool(os.environ.get("DET_SHIPPER_EMIT_STDOUT_LOGS"))

        metadata["source"] = "task"

        exit_code = main(
            master_url,
            cert_name,
            cert_file,
            metadata,
            token,
            emit_stdout_logs,
            cmd=sys.argv[1:],
            log_wait_time=log_wait_time,
        )
    except Exception:
        logging.error("ship_logs.py crashed!", exc_info=True)
        # 80 is the exit code we use to signal "ship_logs.py failed".
        sys.exit(80)

    sys.exit(exit_code)
