// Copyright 2021 Redpanda Data, Inc.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.md
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0

#include "cluster/archival/archival_metadata_stm.h"

#include "base/vlog.h"
#include "bytes/iobuf.h"
#include "bytes/iostream.h"
#include "cloud_storage/partition_manifest.h"
#include "cloud_storage/partition_manifest_downloader.h"
#include "cloud_storage/remote.h"
#include "cloud_storage/remote_path_provider.h"
#include "cloud_storage/types.h"
#include "cluster/archival/logger.h"
#include "cluster/errc.h"
#include "cluster/logger.h"
#include "cluster/prefix_truncate_record.h"
#include "cluster/types.h"
#include "config/configuration.h"
#include "container/fragmented_vector.h"
#include "features/feature_table.h"
#include "model/fundamental.h"
#include "model/metadata.h"
#include "model/record.h"
#include "model/record_batch_types.h"
#include "model/record_utils.h"
#include "model/timeout_clock.h"
#include "raft/consensus.h"
#include "raft/persisted_stm.h"
#include "resource_mgmt/io_priority.h"
#include "serde/envelope.h"
#include "serde/rw/envelope.h"
#include "serde/rw/iobuf.h"
#include "serde/rw/optional.h"
#include "serde/rw/vector.h"
#include "ssx/future-util.h"
#include "ssx/semaphore.h"
#include "storage/ntp_config.h"
#include "storage/record_batch_builder.h"
#include "storage/record_batch_utils.h"
#include "utils/named_type.h"

#include <seastar/core/coroutine.hh>
#include <seastar/core/do_with.hh>
#include <seastar/core/future.hh>
#include <seastar/core/loop.hh>
#include <seastar/core/lowres_clock.hh>
#include <seastar/core/semaphore.hh>
#include <seastar/core/shared_future.hh>
#include <seastar/core/sleep.hh>
#include <seastar/core/sstring.hh>
#include <seastar/util/bool_class.hh>
#include <seastar/util/defer.hh>

#include <algorithm>
#include <exception>
#include <optional>
#include <utility>

namespace cluster {

namespace {

// Strongly typed one-byte command discriminator. Every archival command
// declared below exposes a distinct `key` of this type; the batch builder
// serializes that key as the record key.
using cmd_key = named_type<uint8_t, struct cmd_key_tag>;

} // namespace

// Serialized description of a single segment. It is the value type of
// add_segment_cmd and the element type of the segment lists stored in the
// STM snapshot.
struct archival_metadata_stm::segment
  : public serde::
      envelope<segment, serde::version<1>, serde::compat_version<0>> {
    // ntp_revision is needed to reconstruct full remote path of
    // the segment. Deprecated because ntp_revision is now part of
    // segment_meta.
    model::initial_revision_id ntp_revision_deprecated;
    // Local name of the segment.
    cloud_storage::segment_name name;
    // Metadata of the segment as it is stored in the partition manifest.
    cloud_storage::partition_manifest::segment_meta meta;
    // Flag is set to true if the segment was validated before being added.
    // Segment meta value generated by old redpanda versions will have the
    // default value of 'no'.
    segment_validated is_validated{segment_validated::no};

    // Fields exposed to serde, in order.
    auto serde_fields() {
        return std::tie(ntp_revision_deprecated, name, meta, is_validated);
    }
};

// Envelope holding a single offset. Used as the value type of truncate_cmd
// and update_start_offset_cmd.
struct archival_metadata_stm::start_offset
  : public serde::
      envelope<start_offset, serde::version<0>, serde::compat_version<0>> {
    model::offset start_offset;
    // Fields exposed to serde, in order.
    auto serde_fields() { return std::tie(start_offset); }
};

// Envelope holding an offset together with an offset delta. Used as the value
// type of truncate_archive_init_cmd.
struct archival_metadata_stm::start_offset_with_delta
  : public serde::envelope<
      start_offset_with_delta,
      serde::version<0>,
      serde::compat_version<0>> {
    model::offset start_offset;
    model::offset_delta delta;
    // Fields exposed to serde, in order.
    auto serde_fields() { return std::tie(start_offset, delta); }
};

// Command key 0: add a segment. The record value is a serialized `segment`.
struct archival_metadata_stm::add_segment_cmd {
    static constexpr cmd_key key{0};

    using value = segment;
};

// Command key 1. The record value is a serialized `start_offset` envelope.
struct archival_metadata_stm::truncate_cmd {
    static constexpr cmd_key key{1};

    using value = start_offset;
};

// Command key 2: update the start offset. The record value is a serialized
// `start_offset` envelope. This is the command emitted by
// command_batch_builder::truncate.
struct archival_metadata_stm::update_start_offset_cmd {
    static constexpr cmd_key key{2};

    using value = start_offset;
};

// Command key 3: clean up metadata. The command declares no value type; the
// batch builder emits it with an empty value.
struct archival_metadata_stm::cleanup_metadata_cmd {
    static constexpr cmd_key key{3};
};

// Command key 4: mark the state clean. The record value is a serialized
// model::offset.
struct archival_metadata_stm::mark_clean_cmd {
    static constexpr cmd_key key{4};

    using value = model::offset;
};

// Command key 5: start an archive truncation. The record value is a
// serialized `start_offset_with_delta` envelope.
struct archival_metadata_stm::truncate_archive_init_cmd {
    static constexpr cmd_key key{5};

    using value = start_offset_with_delta;
};

// Command key 6: commit an archive truncation. This is the command emitted by
// command_batch_builder::cleanup_archive.
struct archival_metadata_stm::truncate_archive_commit_cmd {
    static constexpr cmd_key key{6};

    // Payload: an offset and the number of bytes that were removed.
    struct value
      : serde::envelope<value, serde::version<0>, serde::compat_version<0>> {
        model::offset start_offset;
        uint64_t bytes_removed;
        // Fields exposed to serde, in order.
        auto serde_fields() { return std::tie(start_offset, bytes_removed); }
    };
};

// Command key 7: update the start kafka offset. The record value is a
// serialized kafka::offset.
struct archival_metadata_stm::update_start_kafka_offset_cmd {
    static constexpr cmd_key key{7};

    using value = kafka::offset;
};

// Command key 8: reset the metadata. The batch builder emits it with an empty
// iobuf as the value.
struct archival_metadata_stm::reset_metadata_cmd {
    static constexpr cmd_key key{8};

    // Unused, left available in case it's useful to pass in further arguments.
    using value = iobuf;
};

// Command key 9: spillover. Unlike most other commands the struct itself is
// the serialized envelope (there is no nested `value` type).
struct archival_metadata_stm::spillover_cmd
  : public serde::
      envelope<spillover_cmd, serde::version<0>, serde::compat_version<0>> {
    static constexpr cmd_key key{9};

    // Metadata describing the spillover manifest.
    cloud_storage::segment_meta manifest_meta;

    // Fields exposed to serde, in order.
    auto serde_fields() { return std::tie(manifest_meta); }
};

// Command key 10: replace the manifest. The record value is an opaque iobuf
// which the batch builder forwards without re-encoding.
struct archival_metadata_stm::replace_manifest_cmd {
    static constexpr cmd_key key{10};

    using value = iobuf;
};

// Command key 11: record the outcome of a scrub.
struct archival_metadata_stm::process_anomalies_cmd {
    static constexpr cmd_key key{11};

    // Payload: scrub timestamp, optional last scrubbed offset, scrub status
    // and the anomalies that were detected.
    struct value
      : serde::envelope<value, serde::version<0>, serde::compat_version<0>> {
        model::timestamp scrub_timestamp;
        std::optional<model::offset> last_scrubbed_offset;

        cloud_storage::scrub_status status;
        cloud_storage::anomalies detected;

        // Fields exposed to serde, in order.
        auto serde_fields() {
            return std::tie(
              scrub_timestamp, last_scrubbed_offset, status, detected);
        }
    };
};

// Command key 12: reset the scrubbing metadata. The command declares no value
// type; the batch builder emits it with a null (std::nullopt) value.
struct archival_metadata_stm::reset_scrubbing_metadata {
    static constexpr cmd_key key{12};
};

// Command key 13: update the highest producer id. The declared value type is
// model::producer_id.
struct archival_metadata_stm::update_highest_producer_id_cmd {
    static constexpr cmd_key key{13};

    using value = model::producer_id;
};

// Command key 14: read-write fence. The struct itself is the serialized
// envelope (there is no nested `value` type).
struct archival_metadata_stm::read_write_fence_cmd
  : public serde::envelope<
      read_write_fence_cmd,
      serde::version<0>,
      serde::compat_version<0>> {
    static constexpr cmd_key key{14};

    // Offset carried by the fence.
    model::offset last_applied_offset;

    // Fields exposed to serde, in order.
    auto serde_fields() { return std::tie(last_applied_offset); }
};

// Serialized form of the STM snapshot.
//
// Serde format description
// v5
//  - add applied_offset field
//
struct archival_metadata_stm::snapshot
  : public serde::
      envelope<snapshot, serde::version<5>, serde::compat_version<0>> {
    /// List of segments
    fragmented_vector<segment> segments;
    /// List of replaced segments
    fragmented_vector<segment> replaced;
    /// Start offset (might be different from the base offset of the first
    /// segment). Default value means that the snapshot was old and didn't
    /// have start_offset. In this case we need to compute it from the
    /// segments.
    model::offset start_offset;
    /// Last uploaded offset (default value means that the snapshot was created
    /// using older version (snapshot v0) and we need to rebuild the offset from
    /// segments)
    model::offset last_offset;
    /// Last uploaded offset belonging to a compacted segment. If set to
    /// default, the next upload attempt will align this with start of manifest.
    model::offset last_uploaded_compacted_offset;
    /// If dirty, then upload to the remote object store is necessary since the
    /// last changes to this local state machine.
    state_dirty dirty{state_dirty::clean};
    /// First accessible offset of the 'archive' (default if there is no
    /// archive)
    model::offset archive_start_offset;
    /// Delta value of the first accessible offset in the archive. We need this
    /// value to be able to provide correct start kafka offset.
    model::offset_delta archive_start_offset_delta;
    // First offset of the 'archive'. Segments below 'archive_start_offset' are
    // collectible by the archive housekeeping.
    model::offset archive_clean_offset;
    // Size of the archive section of the partition
    uint64_t archive_size_bytes;
    // Start kafka offset override (set to min() by default and to some value
    // when DeleteRecords was used to override)
    kafka::offset start_kafka_offset;
    // List of spillover manifests
    fragmented_vector<segment> spillover_manifests;
    // Timestamp of last completed scrub
    model::timestamp last_partition_scrub;
    // Offset at which the previous scrubbing stopped
    std::optional<model::offset> last_scrubbed_offset;
    // Anomalies detected by the scrubber
    cloud_storage::anomalies detected_anomalies;
    // Highest producer ID used by this partition.
    model::producer_id highest_producer_id;
    // Offset of the last applied command
    model::offset applied_offset;

    // Fields exposed to serde, in order.
    auto serde_fields() {
        return std::tie(
          segments,
          replaced,
          start_offset,
          last_offset,
          last_uploaded_compacted_offset,
          dirty,
          archive_start_offset,
          archive_start_offset_delta,
          archive_clean_offset,
          archive_size_bytes,
          start_kafka_offset,
          spillover_manifests,
          last_partition_scrub,
          last_scrubbed_offset,
          detected_anomalies,
          highest_producer_id,
          applied_offset);
    }
};

/// Converts manifest segment metadata into the STM `segment` record.
///
/// The local segment name is generated from the base offset and the term of
/// `meta`; the deprecated revision field mirrors `meta.ntp_revision`.
/// `is_validated` keeps its default value.
inline archival_metadata_stm::segment
segment_from_meta(const cloud_storage::segment_meta& meta) {
    return archival_metadata_stm::segment{
      .ntp_revision_deprecated = meta.ntp_revision,
      .name = cloud_storage::generate_local_segment_name(
        meta.base_offset, meta.segment_term),
      .meta = meta};
}

/// Creates a builder that accumulates archival_metadata commands for `stm`.
///
/// \param stm state machine the batch will be replicated through
/// \param deadline point in time used by replicate() to derive the sync
///        timeout
/// \param as abort source consulted by replicate()
///
/// The record batch builder starts at offset 0, and a holder on the STM gate
/// is taken for the lifetime of the builder.
command_batch_builder::command_batch_builder(
  archival_metadata_stm& stm,
  ss::lowres_clock::time_point deadline,
  ss::abort_source& as)
  : _stm(stm)
  , _builder(model::record_batch_type::archival_metadata, model::offset(0))
  , _deadline(deadline)
  , _as(as)
  , _holder(stm._gate) {}

/// Appends a reset_metadata command to the batch. The command has no
/// arguments, so the record value is an empty buffer.
command_batch_builder& command_batch_builder::reset_metadata() {
    _builder.add_raw_kv(
      serde::to_iobuf(archival_metadata_stm::reset_metadata_cmd::key),
      iobuf{});
    return *this;
}

/// Appends one add_segment command per element of `add_segments`.
///
/// Segments whose `ntp_revision` is still default-initialized inherit the
/// revision of the STM manifest. Every emitted record carries `is_validated`.
command_batch_builder& command_batch_builder::add_segments(
  std::vector<cloud_storage::segment_meta> add_segments,
  segment_validated is_validated) {
    using cmd = archival_metadata_stm::add_segment_cmd;

    for (auto& meta : add_segments) {
        const bool revision_unset = meta.ntp_revision
                                    == model::initial_revision_id{};
        if (revision_unset) {
            meta.ntp_revision = _stm.get()._manifest->get_revision_id();
        }

        cmd::value payload{segment_from_meta(meta)};
        payload.is_validated = is_validated;

        _builder.add_raw_kv(
          serde::to_iobuf(cmd::key), serde::to_iobuf(std::move(payload)));
    }
    return *this;
}

/// Appends a cleanup_metadata command (empty value) to the batch.
command_batch_builder& command_batch_builder::cleanup_metadata() {
    // NOTE: no check is made whether the manifest actually has anything to
    // clean up. The command may share a batch with other commands that
    // produce the garbage it is supposed to collect.
    _builder.add_raw_kv(
      serde::to_iobuf(archival_metadata_stm::cleanup_metadata_cmd::key),
      iobuf{});
    return *this;
}

/// Appends a replace_manifest command. `replacement` is forwarded as the
/// record value as is.
command_batch_builder&
command_batch_builder::replace_manifest(iobuf replacement) {
    _builder.add_raw_kv(
      serde::to_iobuf(archival_metadata_stm::replace_manifest_cmd::key),
      std::move(replacement));
    return *this;
}

/// Appends a process_anomalies command carrying the outcome of a scrub:
/// its timestamp, the optional offset it stopped at, its status and the
/// anomalies it found.
command_batch_builder& command_batch_builder::process_anomalies(
  model::timestamp scrub_timestamp,
  std::optional<model::offset> last_scrubbed_offset,
  cloud_storage::scrub_status status,
  cloud_storage::anomalies detected) {
    using cmd = archival_metadata_stm::process_anomalies_cmd;

    cmd::value payload{
      .scrub_timestamp = scrub_timestamp,
      .last_scrubbed_offset = last_scrubbed_offset,
      .status = status,
      .detected = std::move(detected)};

    iobuf key = serde::to_iobuf(cmd::key);
    _builder.add_raw_kv(std::move(key), serde::to_iobuf(std::move(payload)));
    return *this;
}

/// Appends a reset_scrubbing_metadata command. The record is written with a
/// null value (not an empty buffer).
command_batch_builder& command_batch_builder::reset_scrubbing_metadata() {
    _builder.add_raw_kv(
      serde::to_iobuf(archival_metadata_stm::reset_scrubbing_metadata::key),
      std::nullopt);
    return *this;
}

/// Appends a mark_clean command whose value is the serialized `clean_at`
/// offset.
command_batch_builder&
command_batch_builder::mark_clean(model::offset clean_at) {
    using cmd = archival_metadata_stm::mark_clean_cmd;

    iobuf key = serde::to_iobuf(cmd::key);
    iobuf value = serde::to_iobuf(clean_at);
    _builder.add_raw_kv(std::move(key), std::move(value));
    return *this;
}

/// Appends an update_start_offset command that carries `start_rp_offset`.
command_batch_builder&
command_batch_builder::truncate(model::offset start_rp_offset) {
    using cmd = archival_metadata_stm::update_start_offset_cmd;

    cmd::value payload{.start_offset = start_rp_offset};
    _builder.add_raw_kv(serde::to_iobuf(cmd::key), serde::to_iobuf(payload));
    return *this;
}

/// Appends an update_start_kafka_offset command that carries
/// `start_kafka_offset`.
command_batch_builder& command_batch_builder::update_start_kafka_offset(
  kafka::offset start_kafka_offset) {
    using cmd = archival_metadata_stm::update_start_kafka_offset_cmd;

    cmd::value payload{start_kafka_offset};
    _builder.add_raw_kv(serde::to_iobuf(cmd::key), serde::to_iobuf(payload));
    return *this;
}

/// Appends a spillover command wrapping `meta`, the metadata of the
/// spillover manifest.
command_batch_builder&
command_batch_builder::spillover(const cloud_storage::segment_meta& meta) {
    using cmd = archival_metadata_stm::spillover_cmd;

    cmd payload{.manifest_meta = meta};
    _builder.add_raw_kv(serde::to_iobuf(cmd::key), serde::to_iobuf(payload));
    return *this;
}

/// Appends a truncate_archive_init command carrying the offset and its delta.
command_batch_builder& command_batch_builder::truncate_archive_init(
  model::offset start_rp_offset, model::offset_delta delta) {
    using cmd = archival_metadata_stm::truncate_archive_init_cmd;

    cmd::value payload{.start_offset = start_rp_offset, .delta = delta};
    _builder.add_raw_kv(serde::to_iobuf(cmd::key), serde::to_iobuf(payload));
    return *this;
}

/// Appends a truncate_archive_commit command carrying the offset and the
/// number of bytes removed.
command_batch_builder& command_batch_builder::cleanup_archive(
  model::offset start_rp_offset, uint64_t bytes_removed) {
    using cmd = archival_metadata_stm::truncate_archive_commit_cmd;

    cmd::value payload{
      .start_offset = start_rp_offset, .bytes_removed = bytes_removed};
    _builder.add_raw_kv(serde::to_iobuf(cmd::key), serde::to_iobuf(payload));
    return *this;
}

/// Appends a read_write_fence command carrying `offset` as its
/// `last_applied_offset`.
command_batch_builder&
command_batch_builder::read_write_fence(model::offset offset) {
    using cmd = archival_metadata_stm::read_write_fence_cmd;

    cmd payload{.last_applied_offset = offset};
    _builder.add_raw_kv(serde::to_iobuf(cmd::key), serde::to_iobuf(payload));
    return *this;
}

/// Appends an update_highest_producer_id command, unless `highest_pid` is the
/// default-constructed producer id, in which case the batch is left untouched.
command_batch_builder& command_batch_builder::update_highest_producer_id(
  model::producer_id highest_pid) {
    if (highest_pid == model::producer_id{}) {
        return *this;
    }

    using cmd = archival_metadata_stm::update_highest_producer_id_cmd;
    // The underlying integer of the id is what gets serialized.
    _builder.add_raw_kv(
      serde::to_iobuf(cmd::key), serde::to_iobuf(highest_pid()));
    return *this;
}

/// Builds the accumulated batch and replicates it through the STM.
///
/// Steps, in order:
///  1. check the abort source;
///  2. take the STM lock (abortable) and a gate holder;
///  3. sync the STM using the time left until the deadline (0ms if the
///     deadline has already passed);
///  4. build the batch and hand it to do_replicate_commands.
///
/// \return errc::not_leader if the sync fails, otherwise the outcome of
///         do_replicate_commands as observed through the abortable wrapper.
ss::future<std::error_code> command_batch_builder::replicate() {
    _as.check();

    // Both are kept alive until the replication future resolves (see the
    // `finally` below), not merely until this coroutine returns.
    auto units = co_await _stm.get()._lock.get_units(_as);
    auto holder = _stm.get()._gate.hold();

    vlog(_stm.get()._logger.debug, "command_batch_builder::replicate called");
    auto now = ss::lowres_clock::now();
    // Remaining time budget; never negative.
    auto timeout = now < _deadline ? _deadline - now : 0ms;

    // Block on syncing the STM.
    auto did_sync = co_await _stm.get().do_sync(timeout, &_as);
    if (!did_sync) {
        co_return errc::not_leader;
    }

    // The builder is consumed here.
    auto batch = std::move(_builder).build();
    auto f = _stm.get()
               .do_replicate_commands(std::move(batch), _as)
               .finally([u = std::move(units), h = std::move(holder)] {});

    // The above do_replicate_commands call is not cancellable at every point
    // due to the guarantees we need from the operation for linearizability. To
    // respect callers' cancellation requests, we wrap the future in a
    // cancellable future but leave the operation running.
    //
    // The operation can continue safely in background because it holds the
    // lock and the gate. The lock also ensures that no concurrent replicate
    // calls can be made and we won't leak continuations.
    co_return co_await ssx::with_timeout_abortable(
      std::move(f), model::no_timeout, _as);
}

/// Creates a command batch builder bound to this STM, using `deadline` and
/// `as` for the eventual replicate() call.
command_batch_builder archival_metadata_stm::batch_start(
  ss::lowres_clock::time_point deadline, ss::abort_source& as) {
    return command_batch_builder(*this, deadline, as);
}

/// Converts every segment of `manifest` into an STM `segment` record.
///
/// Segments with a default-initialized `ntp_revision` get the revision of
/// the manifest. A segment with a default-initialized term trips an
/// assertion.
fragmented_vector<archival_metadata_stm::segment>
archival_metadata_stm::segments_from_manifest(
  const cloud_storage::partition_manifest& manifest) {
    fragmented_vector<segment> result;
    for (auto meta : manifest) {
        const bool revision_unset = meta.ntp_revision
                                    == model::initial_revision_id{};
        if (revision_unset) {
            meta.ntp_revision = manifest.get_revision_id();
        }
        // NOTE: at this point the manifest is expected to carry a meaningful
        // 'segment_term'. When it is absent from segment_meta it gets
        // populated from the segment name, both during deserialization from
        // json and during deserialization of the archival snapshot.
        vassert(
          meta.segment_term != model::term_id{},
          "segment_term is invalid in segment with base offset {}",
          meta.base_offset);
        result.push_back(segment_from_meta(meta));
    }
    return result;
}

/// Converts the replaced segments of `manifest` into STM `segment` records.
/// Segments with a default-initialized `ntp_revision` get the revision of
/// the manifest.
fragmented_vector<archival_metadata_stm::segment>
archival_metadata_stm::replaced_segments_from_manifest(
  const cloud_storage::partition_manifest& manifest) {
    fragmented_vector<segment> result;
    auto replaced_metas = manifest.replaced_segments();
    for (auto meta : replaced_metas) {
        const bool revision_unset = meta.ntp_revision
                                    == model::initial_revision_id{};
        if (revision_unset) {
            meta.ntp_revision = manifest.get_revision_id();
        }
        result.push_back(segment_from_meta(meta));
    }
    return result;
}

/// Converts the spillover map of `manifest` into STM `segment` records. The
/// entries are converted as they are; no revision fix-up is applied.
fragmented_vector<archival_metadata_stm::segment>
archival_metadata_stm::spillover_from_manifest(
  const cloud_storage::partition_manifest& manifest) {
    fragmented_vector<segment> spillover;
    const auto& spillover_map = manifest.get_spillover_map();
    for (auto entry : spillover_map) {
        spillover.push_back(segment_from_meta(entry));
    }
    return spillover;
}

/// Serializes the segments of `m` as add_segment commands, packed into
/// archival_metadata record batches of at most `records_per_batch` records.
///
/// The first batch starts at `base_offset`; each following batch starts
/// right after the records of the previous one. An empty trailing batch is
/// never emitted.
ss::circular_buffer<model::record_batch>
archival_metadata_stm::serialize_manifest_as_batches(
  model::offset base_offset, const cloud_storage::partition_manifest& m) {
    // this will give us around 10K per batch
    static constexpr int records_per_batch = 100;

    ss::circular_buffer<model::record_batch> batches;
    std::optional<storage::record_batch_builder> current;
    const auto open_batch = [&current](model::offset first_offset) {
        current.emplace(
          model::record_batch_type::archival_metadata, first_offset);
    };

    open_batch(base_offset);
    int pending = 0;
    for (auto meta : m) {
        if (meta.ntp_revision == model::initial_revision_id{}) {
            meta.ntp_revision = m.get_revision_id();
        }
        current->add_raw_kv(
          serde::to_iobuf(add_segment_cmd::key),
          serde::to_iobuf(add_segment_cmd::value{segment_from_meta(meta)}));

        ++pending;
        if (pending < records_per_batch) {
            continue;
        }
        // The batch is full: seal it and open the next one right after it.
        batches.push_back(std::move(*current).build());
        base_offset = base_offset + model::offset(pending);
        open_batch(base_offset);
        pending = 0;
    }

    if (!current->empty()) {
        batches.push_back(std::move(*current).build());
    }
    return batches;
}

/// Creates a log segment file and fills it with archival_metadata batches
/// that describe the segments of `manifest`.
///
/// \param ntp_cfg used, together with `base_offset` and `term`, to derive the
///        path of the segment file
/// \param base_offset offset of the first record written to the segment
/// \param term term used in the segment path
/// \param manifest source of the segment metadata
///
/// Any exception raised while creating or writing the file is logged and
/// rethrown to the caller.
ss::future<> archival_metadata_stm::create_log_segment_with_config_batches(
  const storage::ntp_config& ntp_cfg,
  model::offset base_offset,
  model::term_id term,
  const cloud_storage::partition_manifest& manifest) {
    auto path = storage::segment_full_path(
      ntp_cfg, base_offset, term, storage::record_version_type::v1);

    auto archival_batches
      = cluster::archival_metadata_stm::serialize_manifest_as_batches(
        base_offset, manifest);
    try {
        vlog(
          clusterlog.debug,
          "Creating the segment file with the path {}",
          path.string());
        auto handle = co_await ss::open_file_dma(
          path.string(), ss::open_flags::rw | ss::open_flags::create);
        // Close the file in the background once this scope is left, on both
        // the success and the failure path.
        auto h = ss::defer(
          [handle]() mutable { ssx::background = handle.close(); });
        auto stream = co_await ss::make_file_output_stream(handle);
        for (auto& b : archival_batches) {
            // The header CRC has to be filled in before the header is
            // rendered into its on-disk form.
            b.header().header_crc = model::internal_header_only_crc(b.header());
            vlog(clusterlog.debug, "Writing archival batch {}", b.header());
            // On-disk header followed by the batch payload.
            iobuf buffer = storage::batch_header_to_disk_iobuf(b.header());
            buffer.append(std::move(b).release_data());
            auto batch_stream = make_iobuf_input_stream(std::move(buffer));
            co_await ss::copy(batch_stream, stream);
        }
        co_await stream.flush();
    } catch (...) {
        vlog(
          clusterlog.error,
          "Failed to create a log segment, {}",
          std::current_exception());
        throw;
    }
}
/**
 * Persist a local STM snapshot built from a clean state obtained out of band
 * (for example during topic recovery).
 *
 * The snapshot content is taken from the manifest `m`; `insync_offset` is
 * recorded as the offset of the snapshot. The snapshot is written into the
 * work directory of `ntp_cfg`.
 */
ss::future<> archival_metadata_stm::make_snapshot(
  const storage::ntp_config& ntp_cfg,
  const cloud_storage::partition_manifest& m,
  model::offset insync_offset) {
    // Assemble the archival STM state. Fields that are not listed keep their
    // default values.
    snapshot state{
      .segments = segments_from_manifest(m),
      .replaced = replaced_segments_from_manifest(m),
      .start_offset = m.get_start_offset().value_or(model::offset{}),
      .last_offset = m.get_last_offset(),
      .last_uploaded_compacted_offset = m.get_last_uploaded_compacted_offset(),
      .dirty = state_dirty::clean,
      .archive_start_offset = m.get_archive_start_offset(),
      .archive_start_offset_delta = m.get_archive_start_offset_delta(),
      .archive_clean_offset = m.get_archive_clean_offset(),
      .archive_size_bytes = m.archive_size_bytes(),
      .start_kafka_offset = m.get_start_kafka_offset_override(),
      .spillover_manifests = spillover_from_manifest(m),
      .highest_producer_id = m.highest_producer_id(),
      .applied_offset = m.get_applied_offset(),
    };
    iobuf payload = serde::to_iobuf(std::move(state));

    auto stm_snap = raft::stm_snapshot::create(
      0, insync_offset, std::move(payload));

    // Throw-away manager pointing at the partition's work directory.
    storage::simple_snapshot_manager snapshot_mgr(
      std::filesystem::path(ntp_cfg.work_directory()),
      archival_stm_snapshot,
      raft_priority());

    co_await raft::file_backed_stm_snapshot::persist_local_snapshot(
      snapshot_mgr, std::move(stm_snap));
}

/// Reports whether an archival STM snapshot exists in the work directory of
/// `ntp_cfg`.
ss::future<bool>
archival_metadata_stm::has_snapshot(const storage::ntp_config& ntp_cfg) {
    storage::simple_snapshot_manager probe(
      std::filesystem::path(ntp_cfg.work_directory()),
      archival_stm_snapshot,
      raft_priority());
    const bool found = co_await probe.snapshot_exists();
    co_return found;
}

/// Constructs the archival metadata STM on top of `raft`.
///
/// \param raft consensus instance; also the source of the ntp and of the
///        remote revision used for the manifest
/// \param remote cloud storage API handle kept by the STM
/// \param ft feature table kept by the STM
/// \param logger base logger; the STM logger is prefixed with the ntp
/// \param remote_label forwarded to the remote path provider
/// \param remote_topic_namespace_override forwarded to the remote path
///        provider
///
/// The manifest starts out empty and shares a memory tracker named after the
/// ntp path.
archival_metadata_stm::archival_metadata_stm(
  raft::consensus* raft,
  cloud_storage::remote& remote,
  features::feature_table& ft,
  ss::logger& logger,
  std::optional<cloud_storage::remote_label> remote_label,
  std::optional<model::topic_namespace> remote_topic_namespace_override)
  : raft::persisted_stm<>(archival_stm_snapshot, logger, raft)
  , _logger(logger, ssx::sformat("ntp: {}", raft->ntp()))
  , _mem_tracker(ss::make_shared<util::mem_tracker>(raft->ntp().path()))
  , _manifest(ss::make_shared<cloud_storage::partition_manifest>(
      raft->ntp(), raft->log_config().get_remote_revision(), _mem_tracker))
  , _cloud_storage_api(remote)
  , _feature_table(ft)
  , _remote_path_provider(remote_label, remote_topic_namespace_override) {}

/// Replicates an update_start_offset_cmd command for `start_rp_offset`.
///
/// Returns errc::success without replicating anything when
/// `start_rp_offset` is below the current start offset.
ss::future<std::error_code> archival_metadata_stm::truncate(
  model::offset start_rp_offset,
  ss::lowres_clock::time_point deadline,
  ss::abort_source& as) {
    const bool nothing_to_do = start_rp_offset < get_start_offset();
    if (nothing_to_do) {
        co_return errc::success;
    }
    auto batch = batch_start(deadline, as);
    co_return co_await batch.truncate(start_rp_offset).replicate();
}

/// Replicates an update_start_kafka_offset command for
/// `start_kafka_offset`. No precondition is checked here.
ss::future<std::error_code> archival_metadata_stm::truncate(
  kafka::offset start_kafka_offset,
  ss::lowres_clock::time_point deadline,
  ss::abort_source& as) {
    auto batch = batch_start(deadline, as);
    co_return co_await batch.update_start_kafka_offset(start_kafka_offset)
      .replicate();
}

/// Replicates a spillover command for `manifest_meta`.
///
/// Returns errc::success without replicating anything when the offset
/// following `manifest_meta.committed_offset` is below the current start
/// offset.
ss::future<std::error_code> archival_metadata_stm::spillover(
  const cloud_storage::segment_meta& manifest_meta,
  ss::lowres_clock::time_point deadline,
  ss::abort_source& as) {
    const auto next_start = model::next_offset(manifest_meta.committed_offset);
    const bool nothing_to_do = next_start < get_start_offset();
    if (nothing_to_do) {
        co_return errc::success;
    }
    auto batch = batch_start(deadline, as);
    co_return co_await batch.spillover(manifest_meta).replicate();
}

/// Replicates a truncate_archive_init command for `start_rp_offset` and
/// `delta`.
///
/// Returns errc::success without replicating anything when
/// `start_rp_offset` is below the current archive start offset.
ss::future<std::error_code> archival_metadata_stm::truncate_archive_init(
  model::offset start_rp_offset,
  model::offset_delta delta,
  ss::lowres_clock::time_point deadline,
  ss::abort_source& as) {
    const bool nothing_to_do = start_rp_offset < get_archive_start_offset();
    if (nothing_to_do) {
        co_return errc::success;
    }
    auto batch = batch_start(deadline, as);
    co_return co_await batch.truncate_archive_init(start_rp_offset, delta)
      .replicate();
}

/// Replicates a truncate_archive_commit command for `start_rp_offset` and
/// `removed_size_bytes`.
///
/// Returns errc::success without replicating anything when
/// `start_rp_offset` is below the current archive clean offset.
ss::future<std::error_code> archival_metadata_stm::cleanup_archive(
  model::offset start_rp_offset,
  uint64_t removed_size_bytes,
  ss::lowres_clock::time_point deadline,
  ss::abort_source& as) {
    const bool nothing_to_do = start_rp_offset < get_archive_clean_offset();
    if (nothing_to_do) {
        co_return errc::success;
    }
    auto batch = batch_start(deadline, as);
    co_return co_await batch
      .cleanup_archive(start_rp_offset, removed_size_bytes)
      .replicate();
}

/// Replicates a single cleanup_metadata command.
ss::future<std::error_code> archival_metadata_stm::cleanup_metadata(
  ss::lowres_clock::time_point deadline, ss::abort_source& as) {
    auto batch = batch_start(deadline, as);
    co_return co_await batch.cleanup_metadata().replicate();
}

/// Replicates a process_anomalies command describing the outcome of a scrub.
ss::future<std::error_code> archival_metadata_stm::process_anomalies(
  model::timestamp scrub_timestamp,
  std::optional<model::offset> last_scrubbed_offset,
  cloud_storage::scrub_status status,
  cloud_storage::anomalies detected,
  ss::lowres_clock::time_point deadline,
  ss::abort_source& as) {
    auto batch = batch_start(deadline, as);
    co_return co_await batch
      .process_anomalies(
        scrub_timestamp, last_scrubbed_offset, status, std::move(detected))
      .replicate();
}

/// Convenience overload of sync() for callers without an abort source.
ss::future<std::optional<model::offset>>
archival_metadata_stm::sync(model::timeout_clock::duration timeout) {
    ss::abort_source* const no_abort_source = nullptr;
    return sync(timeout, no_abort_source);
}

/// Syncs the STM while holding its lock.
///
/// `timeout` bounds the wait for the lock and is also passed on to do_sync.
/// `as` is forwarded to do_sync and may be null.
///
/// \return the applied offset of the manifest when the sync succeeds;
///         std::nullopt when the sync fails or when the lock could not be
///         acquired in time.
ss::future<std::optional<model::offset>> archival_metadata_stm::sync(
  model::timeout_clock::duration timeout, ss::abort_source* as) {
    auto sync_under_lock = [this, timeout, as] {
        return do_sync(timeout, as).then(
          [this](bool synced) -> std::optional<model::offset> {
              if (!synced) {
                  return std::nullopt;
              }
              return _manifest->get_applied_offset();
          });
    };

    // A timed out lock acquisition is reported as "not synced".
    auto on_lock_timeout =
      [](const ss::semaphore_timed_out&) -> std::optional<model::offset> {
        return std::nullopt;
    };

    return _lock.with(timeout, std::move(sync_under_lock))
      .handle_exception_type(std::move(on_lock_timeout));
}

/// Syncs the STM. Must be called with `_lock` held (asserted below).
///
/// First runs the base class sync with `timeout`. Then, if the manifest's
/// in-sync offset is behind the raft committed offset, waits until the
/// committed offset is applied; that wait gets its own deadline of
/// now + `timeout` and is abortable when `as` is not null.
///
/// \return false if the base class sync fails, the outcome of the wait if a
///         wait was needed, true otherwise.
ss::future<bool> archival_metadata_stm::do_sync(
  model::timeout_clock::duration timeout, ss::abort_source* as) {
    auto holder = _gate.hold();
    if (!co_await raft::persisted_stm<>::sync(timeout)) {
        co_return false;
    }

    // If units can be taken here, the caller is not holding the lock.
    vassert(
      !_lock.try_get_units().has_value(),
      "Attempt to replicate STM command while not under lock");
    // This mutex is held during the replication of the record batch.
    // After acquiring the mutex we can't have in-flight replication request but
    // we can have some data in the log which is not applied to the STM yet.
    // The code below simply waits until all record batches up to current
    // committed offset are applied to the STM.
    // The log eviction STM can also replicate its own batch without holding the
    // lock but it's not critical.

    auto insync = _manifest->get_insync_offset();

    // all archival commands are replicated with acks=-1
    // this is why we can use committed_offset
    auto commit = _raft->committed_offset();
    if (insync < commit) {
        // NOTE: both branches return right away, so the assertion at the end
        // of the function is only evaluated when no wait was necessary.
        if (as == nullptr) {
            co_return co_await wait_no_throw(
              commit, ss::lowres_clock::now() + timeout);
        } else {
            co_return co_await wait_no_throw(
              commit, ss::lowres_clock::now() + timeout, *as);
        }
    }
    // This should be impossible under lock
    vassert(
      _active_operation_res.has_value() == false, "Concurrency violation");
    co_return true;
}

// Replicate a batch of archival commands and wait until this STM applied it.
//
// Must be invoked while `_lock` is held and while no other operation is
// waiting for an apply result (both are asserted).
//
// Returns:
//  - errc::timeout            if the STM could not catch up with the
//                             committed offset before replicating;
//  - the raft error           if replication failed (the leader steps down
//                             first if it still leads `current_term`);
//  - errc::shutting_down      if `as` was triggered while waiting for the
//                             batch to be applied, or the result promise was
//                             broken;
//  - errc::replication_error  if waiting for the batch to be applied failed
//                             for any other reason (the leader steps down);
//  - otherwise the error code delivered by `do_apply` for the batch.
ss::future<std::error_code> archival_metadata_stm::do_replicate_commands(
  model::record_batch batch, ss::abort_source& as) {
    // It is critical that this method does not return except in the following
    // cases:
    // 1. The batch was successfully replicated with required consistency
    //    level.
    // 2. The batch failed to replicate but the leader stepped down.
    //
    // Otherwise, it will lead to _lock and _active_operation_res being reset
    // early allowing for concurrent sync and replicate calls which will lead
    // to race conditions/corruption/undefined behavior.

    auto holder = _gate.hold();

    vassert(
      !_lock.try_get_units().has_value(),
      "Attempt to replicate STM command while not under lock");
    vassert(
      _active_operation_res.has_value() == false, "Concurrency violation");

    const auto current_term = _insync_term;

    // If the caller didn't invoke `sync` before calling `replicate` we
    // might have some batches which are not applied to the STM yet. These
    // batches could potentially set _active_operation_res and therefore
    // mask the actual failure.
    {
        auto commit = _raft->committed_offset();
        auto insync = _manifest->get_insync_offset();
        if (insync < commit) {
            vlog(_logger.debug, "Replicate is called while STM is catching up");
            auto sync_res = co_await do_sync(
              config::shard_local_cfg()
                .cloud_storage_metadata_sync_timeout_ms.value(),
              &as);
            if (!sync_res) {
                vlog(_logger.warn, "Failed to catch up");
                co_return errc::timeout;
            }
        }
    }

    // Create a promise to deliver the result of the batch application
    _active_operation_res.emplace();
    auto broken_promise_to_shutdown = [](const ss::broken_promise&) {
        return errc::shutting_down;
    };
    auto apply_result = _active_operation_res->get_future()
                          .handle_exception_type(broken_promise_to_shutdown);
    // Make sure the promise never outlives this call, whichever way we exit.
    auto op_state_reset = ss::defer([&] { _active_operation_res.reset(); });

    auto opts = raft::replicate_options(raft::consistency_level::quorum_ack);
    opts.set_force_flush();

    auto result = co_await _raft->replicate(
      current_term,
      model::make_memory_record_batch_reader(std::move(batch)),
      opts);
    if (!result) {
        vlog(
          _logger.warn,
          "error on replicating remote segment metadata: {}",
          result.error());
        // If there was an error for whatever reason, it is unsafe to make
        // any assumptions about whether batches were replicated or not.
        // Explicitly step down if we're still leader and force callers to
        // re-sync in a new term with a new leader.
        if (_raft->is_leader() && _raft->term() == current_term) {
            co_await _raft->step_down(ssx::sformat(
              "failed to replicate archival batch in term {}", current_term));
        }
        co_return result.error();
    }

    // The wait has no deadline, so the abort source is the only way to
    // interrupt it. Pass it through so that the `abort_requested()` branch
    // below is actually reachable and shutdown is not blocked on the apply.
    auto applied = co_await wait_no_throw(
      result.value().last_offset, model::no_timeout, as);
    if (!applied) {
        if (as.abort_requested()) {
            co_return errc::shutting_down;
        }

        if (_raft->is_leader() && _raft->term() == current_term) {
            co_await _raft->step_down(ssx::sformat(
              "failed to replicate archival batch in term {}", current_term));
        }
        co_return errc::replication_error;
    }

    co_return co_await std::move(apply_result);
}

// Replicate a single mark_clean command for `clean_offset` using the batch
// builder obtained from `batch_start(deadline, as)`, and return the error
// code produced by the builder's `replicate()`.
ss::future<std::error_code> archival_metadata_stm::mark_clean(
  ss::lowres_clock::time_point deadline,
  model::offset clean_offset,
  ss::abort_source& as) {
    auto cmd_batch = batch_start(deadline, as);
    cmd_batch.mark_clean(clean_offset);
    auto ec = co_await cmd_batch.replicate();
    co_return ec;
}

// Public entry point for adding segments: acquires `_lock` (waiting at most
// until `deadline`) and runs `do_add_segments` with the same arguments while
// the lock is held.
ss::future<std::error_code> archival_metadata_stm::add_segments(
  std::vector<cloud_storage::segment_meta> segments,
  std::optional<model::offset> clean_offset,
  model::producer_id highest_pid,
  ss::lowres_clock::time_point deadline,
  ss::abort_source& as,
  segment_validated is_validated) {
    // Time left until the deadline; zero if the deadline already passed.
    const auto started_at = ss::lowres_clock::now();
    auto lock_timeout = deadline > started_at ? deadline - started_at : 0ms;

    auto locked_op = [this,
                      to_add = std::move(segments),
                      clean_offset,
                      highest_pid,
                      deadline,
                      &as,
                      is_validated]() mutable {
        return do_add_segments(
          std::move(to_add),
          clean_offset,
          highest_pid,
          deadline,
          as,
          is_validated);
    };
    return _lock.with(lock_timeout, std::move(locked_op));
}

// Build and replicate one archival_metadata batch that adds `add_segments`
// to the manifest. Expected to run under `_lock` (see `add_segments`).
//
// The batch contains, in order:
//  - one add_segment command per element of `add_segments`;
//  - a mark_clean command if `clean_offset` is set;
//  - an update_highest_producer_id command if `highest_pid` is not the
//    default-constructed producer id.
//
// Returns errc::timeout if the STM cannot be synced before `deadline`,
// errc::success without replicating anything if `add_segments` is empty,
// the error from `do_replicate_commands` if replication fails, and
// errc::success otherwise. `as.check()` is invoked after the sync and
// raises if an abort was requested.
ss::future<std::error_code> archival_metadata_stm::do_add_segments(
  std::vector<cloud_storage::segment_meta> add_segments,
  std::optional<model::offset> clean_offset,
  model::producer_id highest_pid,
  ss::lowres_clock::time_point deadline,
  ss::abort_source& as,
  segment_validated is_validated) {
    auto holder = _gate.hold();
    {
        // Time left until the deadline; zero if it has already passed.
        auto now = ss::lowres_clock::now();
        auto timeout = now < deadline ? deadline - now : 0ms;
        if (!co_await do_sync(timeout, &as)) {
            co_return errc::timeout;
        }
    }

    as.check();

    if (add_segments.empty()) {
        co_return errc::success;
    }

    storage::record_batch_builder b(
      model::record_batch_type::archival_metadata, model::offset(0));
    for (auto& meta : add_segments) {
        iobuf key_buf = serde::to_iobuf(add_segment_cmd::key);
        // Fill in the revision from the manifest when the caller left it
        // at its initial value.
        if (meta.ntp_revision == model::initial_revision_id{}) {
            meta.ntp_revision = _manifest->get_revision_id();
        }
        auto record_val = add_segment_cmd::value{segment_from_meta(meta)};
        record_val.is_validated = is_validated;
        iobuf val_buf = serde::to_iobuf(std::move(record_val));
        b.add_raw_kv(std::move(key_buf), std::move(val_buf));
    }

    if (clean_offset.has_value()) {
        iobuf key_buf = serde::to_iobuf(
          archival_metadata_stm::mark_clean_cmd::key);
        iobuf val_buf = serde::to_iobuf(clean_offset.value());
        b.add_raw_kv(std::move(key_buf), std::move(val_buf));
    }

    if (highest_pid != model::producer_id{}) {
        iobuf key_buf = serde::to_iobuf(
          archival_metadata_stm::update_highest_producer_id_cmd::key);
        iobuf val_buf = serde::to_iobuf(highest_pid());
        b.add_raw_kv(std::move(key_buf), std::move(val_buf));
    }

    auto batch = std::move(b).build();
    auto ec = co_await do_replicate_commands(std::move(batch), as);
    if (ec) {
        co_return ec;
    }

    for (const auto& meta : add_segments) {
        auto name = cloud_storage::generate_local_segment_name(
          meta.base_offset, meta.segment_term);
        // Note the ", " after "meta: {}": without it the meta field and
        // "remote start_offset" run together in the log line.
        vlog(
          _logger.info,
          "new remote segment added (name: {}, meta: {}, "
          "remote start_offset: {}, last_offset: {}, highest_producer_id: {})",
          name,
          meta,
          get_start_offset(),
          get_last_offset(),
          highest_pid);
    }

    co_return errc::success;
}

// Apply one committed record batch to the in-memory manifest.
//
// - Batches that are neither `archival_metadata` nor `prefix_truncate` only
//   advance the manifest's in-sync offset.
// - `prefix_truncate` batches update the dirty/applied offsets per record
//   and apply a Kafka start offset override for records carrying
//   `prefix_truncate_key`.
// - `archival_metadata` batches are decoded record by record and dispatched
//   to the matching `apply_*` method. When the record loop finishes, the
//   waiter (if any is still registered) is notified with errc::success; if
//   an exception escapes the loop it is logged and forwarded to the waiter
//   instead.
//
// For the last two batch types the in-sync offset is advanced to the batch's
// last offset after the records were processed, including when the archival
// branch caught an exception.
ss::future<> archival_metadata_stm::do_apply(const model::record_batch& b) {
    if (
      b.header().type != model::record_batch_type::archival_metadata
      && b.header().type != model::record_batch_type::prefix_truncate) {
        // Advance in-sync offset immediately in case of data batch,
        // or the batch that belongs to another STM. If the batch is handled
        // by this STM it can only be advanced after we applied it.
        _manifest->advance_insync_offset(b.last_offset());
        co_return;
    }

    if (b.header().type == model::record_batch_type::prefix_truncate) {
        // Special case handling for prefix_truncate batches: these
        // originate in log_eviction_stm, but affect the entire partition,
        // local and cloud storage alike. Despite the record originating
        // elsewhere, note that the STM is still deterministic, as records
        // are applied in order and are not allowed to fail.
        b.for_each_record(
          [this, base_offset = b.base_offset()](model::record&& r) {
              // Every record marks the state dirty and moves the applied
              // offset, whether or not its key is recognised below.
              _last_dirty_at = base_offset + model::offset{r.offset_delta()};
              _manifest->advance_applied_offset(_last_dirty_at);
              auto key = serde::from_iobuf<uint8_t>(r.release_key());
              auto val = serde::from_iobuf<prefix_truncate_record>(
                r.release_value());
              if (key == prefix_truncate_key) {
                  // The archival layer can't translate arbitrary redpanda
                  // offsets, so just pass through the Kafka offset as is.
                  apply_update_start_kafka_offset(val.kafka_start_offset);
              }
          });
    } else {
        // Notifies the waiter with success when this scope is left, unless
        // cancelled in the catch block. maybe_notify_waiter() does nothing
        // if an apply_* method already consumed the promise with an error.
        auto on_exit = ss::defer(
          [this] { maybe_notify_waiter(errc::success); });
        try {
            b.for_each_record([this, base_offset = b.base_offset()](
                                model::record&& r) {
                auto key = serde::from_iobuf<cmd_key>(r.release_key());

                // A fence record does not move the applied offset.
                if (key != read_write_fence_cmd::key) {
                    _manifest->advance_applied_offset(
                      base_offset + model::offset{r.offset_delta()});
                }

                if (key != mark_clean_cmd::key) {
                    // All keys other than mark clean make the manifest dirty
                    _last_dirty_at = base_offset
                                     + model::offset{r.offset_delta()};
                }

                switch (key) {
                case add_segment_cmd::key:
                    apply_add_segment(serde::from_iobuf<add_segment_cmd::value>(
                      r.release_value()));
                    break;
                case truncate_cmd::key:
                    apply_truncate(serde::from_iobuf<truncate_cmd::value>(
                      r.release_value()));
                    break;
                case update_start_offset_cmd::key:
                    apply_update_start_offset(
                      serde::from_iobuf<update_start_offset_cmd::value>(
                        r.release_value()));
                    break;
                case cleanup_metadata_cmd::key:
                    apply_cleanup_metadata();
                    break;
                case mark_clean_cmd::key:
                    apply_mark_clean(serde::from_iobuf<mark_clean_cmd::value>(
                      r.release_value()));
                    break;
                case truncate_archive_init_cmd::key:
                    apply_truncate_archive_init(
                      serde::from_iobuf<truncate_archive_init_cmd::value>(
                        r.release_value()));
                    break;
                case truncate_archive_commit_cmd::key: {
                    auto cmd
                      = serde::from_iobuf<truncate_archive_commit_cmd::value>(
                        r.release_value());
                    apply_truncate_archive_commit(
                      cmd.start_offset, cmd.bytes_removed);
                } break;
                case update_start_kafka_offset_cmd::key:
                    apply_update_start_kafka_offset(
                      serde::from_iobuf<update_start_kafka_offset_cmd::value>(
                        r.release_value()));
                    break;
                case reset_metadata_cmd::key:
                    apply_reset_metadata();
                    break;
                case spillover_cmd::key:
                    apply_spillover(
                      serde::from_iobuf<spillover_cmd>(r.release_value()));
                    break;
                case replace_manifest_cmd::key:
                    apply_replace_manifest(r.release_value());
                    break;
                case process_anomalies_cmd::key:
                    apply_process_anomalies(r.release_value());
                    break;
                case reset_scrubbing_metadata::key:
                    apply_reset_scrubbing_metadata();
                    break;
                case update_highest_producer_id_cmd::key:
                    apply_update_highest_producer_id(
                      serde::from_iobuf<update_highest_producer_id_cmd::value>(
                        r.release_value()));
                    break;
                case read_write_fence_cmd::key:
                    if (apply_read_write_fence(
                          serde::from_iobuf<read_write_fence_cmd>(
                            r.release_value()))) {
                        // This means that there is a concurrency violation. The
                        // fence was created before some other command was
                        // applied. We can't apply the commands from this batch.
                        return ss::stop_iteration::yes;
                    }
                    break;
                default:
                    throw std::runtime_error(fmt_with_ctx(
                      fmt::format,
                      "Unknown archival metadata STM command {}",
                      static_cast<int>(key)));
                };
                return ss::stop_iteration::no;
            });
        } catch (...) {
            vlog(
              _logger.error,
              "Unexpected error while applying changes to "
              "archival_metadata_stm: {}",
              std::current_exception());
            // Replace the deferred success notification with the exception.
            on_exit.cancel();
            maybe_notify_waiter(std::current_exception());
        }
    }

    // The offset should only be advanced after all the changes are applied.
    _manifest->advance_insync_offset(b.last_offset());
}

// Rebuild the STM state from the partition manifest stored in the bucket.
//
// The snapshot payload is ignored. The manifest is downloaded into a scratch
// object; on a download error the call sleeps (abortable via `_download_as`)
// for the remaining retry-chain timeout and then throws std::runtime_error.
// When no matching manifest exists, only the next offset is moved to the
// raft start offset. Otherwise the downloaded manifest replaces `*_manifest`
// and the next offset is derived from its in-sync offset.
ss::future<> archival_metadata_stm::apply_raft_snapshot(const iobuf&) {
    cloud_storage::partition_manifest downloaded{
      _manifest->get_ntp(), _manifest->get_revision_id()};

    const auto& bucket_cfg = cloud_storage::configuration::get_bucket_config();
    auto bucket_name = bucket_cfg.value();
    vassert(
      bucket_name,
      "configuration property {} must be set",
      bucket_cfg.name());

    auto download_timeout
      = config::shard_local_cfg().cloud_storage_manifest_upload_timeout_ms();
    auto initial_backoff
      = config::shard_local_cfg().cloud_storage_initial_backoff_ms();

    retry_chain_node retry_root(
      _download_as, download_timeout, initial_backoff);
    cloud_storage::partition_manifest_downloader downloader(
      cloud_storage_clients::bucket_name{*bucket_name},
      _remote_path_provider,
      _manifest->get_ntp(),
      _manifest->get_revision_id(),
      _cloud_storage_api);
    auto outcome = co_await downloader.download_manifest(
      retry_root, &downloaded);
    if (outcome.has_error()) {
        // sleep to the end of timeout to avoid calling handle_eviction in a
        // busy loop.
        co_await ss::sleep_abortable(retry_root.get_timeout(), _download_as);
        throw std::runtime_error{
          fmt::format("couldn't download manifest: {}", outcome.error())};
    }

    const bool manifest_missing
      = outcome.value()
        == cloud_storage::find_partition_manifest_outcome::no_matching_manifest;
    if (manifest_missing) {
        set_next(_raft->start_offset());
        vlog(_logger.info, "handled log eviction, the manifest is absent");
        co_return;
    }

    *_manifest = std::move(downloaded);
    // The counter described the manifest that was just replaced.
    _compacted_replaced_bytes = 0;

    auto remote_start = get_start_offset();

    // Legacy manifests don't carry 'insync_offset'; fall back to the last
    // offset in that case.
    auto insync = _manifest->get_insync_offset();
    if (insync == model::offset{}) {
        insync = _manifest->get_last_offset();
    }
    auto resume_from = std::max(
      _raft->start_offset(), model::next_offset(insync));
    set_next(resume_from);

    vlog(
      _logger.info,
      "handled log eviction, next offset: {}, remote start_offset: {}, "
      "last_offset: {}",
      resume_from,
      remote_start,
      get_last_offset());
}

// Restore the manifest from a locally stored snapshot.
//
// `data` is deserialized into a `snapshot`. If either its start or last
// offset is default-constructed, both are recomputed from the segment list.
// `*_manifest` is then replaced by a manifest built from the snapshot fields
// and `header.offset`, the replaced-bytes counter is reset, and
// `_last_clean_at` is set to 0 for a dirty snapshot or to `header.offset`
// for a clean one.
ss::future<> archival_metadata_stm::apply_local_snapshot(
  raft::stm_snapshot_header header, iobuf&& data) {
    auto snap = serde::from_iobuf<snapshot>(std::move(data));

    // Old format doesn't have start offset and last offset
    const bool offsets_missing = snap.last_offset == model::offset{}
                                 || snap.start_offset == model::offset{};
    if (offsets_missing) {
        for (const auto& seg : snap.segments) {
            snap.start_offset
              = snap.start_offset == model::offset{}
                  ? seg.meta.base_offset
                  : std::min(snap.start_offset, seg.meta.base_offset);
            snap.last_offset = std::max(
              snap.last_offset, seg.meta.committed_offset);
        }
    }
    vlog(
      _logger.info,
      "applying snapshot, so: {}, lo: {}, num segments: {}, num replaced: "
      "{}",
      snap.start_offset,
      snap.last_offset,
      snap.segments.size(),
      snap.replaced.size());

    *_manifest = cloud_storage::partition_manifest(
      _raft->ntp(),
      _raft->log_config().get_remote_revision(),
      _manifest->mem_tracker(),
      snap.start_offset,
      snap.last_offset,
      snap.last_uploaded_compacted_offset,
      header.offset,
      snap.segments,
      snap.replaced,
      snap.start_kafka_offset,
      snap.archive_start_offset,
      snap.archive_start_offset_delta,
      snap.archive_clean_offset,
      snap.archive_size_bytes,
      snap.spillover_manifests,
      snap.last_partition_scrub,
      snap.last_scrubbed_offset,
      snap.detected_anomalies,
      snap.highest_producer_id,
      snap.applied_offset);

    vlog(
      _logger.info,
      "applied snapshot at offset: {}, remote start_offset: {}, "
      "last_offset: {}, spillover map size: {}, highest_producer_id: {}",
      header.offset,
      get_start_offset(),
      get_last_offset(),
      _manifest->get_spillover_map().size(),
      _manifest->highest_producer_id());

    // The counter described the manifest that was just replaced.
    _compacted_replaced_bytes = 0;

    _last_clean_at = snap.dirty == state_dirty::dirty ? model::offset{0}
                                                      : header.offset;
    co_return;
}

// Serialize the current manifest state into a local STM snapshot.
//
// The snapshot offset is `last_applied_offset()`, read after the state was
// serialized; `apply_units` are returned right after that, before logging
// and building the result.
ss::future<raft::stm_snapshot>
archival_metadata_stm::take_local_snapshot(ssx::semaphore_units apply_units) {
    auto active_segments = segments_from_manifest(*_manifest);
    auto replaced_segments = replaced_segments_from_manifest(*_manifest);
    auto spillover_list = spillover_from_manifest(*_manifest);

    snapshot state{
      .segments = std::move(active_segments),
      .replaced = std::move(replaced_segments),
      .start_offset = _manifest->get_start_offset().value_or(model::offset()),
      .last_offset = _manifest->get_last_offset(),
      .last_uploaded_compacted_offset
      = _manifest->get_last_uploaded_compacted_offset(),
      .dirty = get_dirty(),
      .archive_start_offset = _manifest->get_archive_start_offset(),
      .archive_start_offset_delta = _manifest->get_archive_start_offset_delta(),
      .archive_clean_offset = _manifest->get_archive_clean_offset(),
      .archive_size_bytes = _manifest->archive_size_bytes(),
      .start_kafka_offset = _manifest->get_start_kafka_offset_override(),
      .spillover_manifests = std::move(spillover_list),
      .last_partition_scrub = _manifest->last_partition_scrub(),
      .last_scrubbed_offset = _manifest->last_scrubbed_offset(),
      .detected_anomalies = _manifest->detected_anomalies(),
      .highest_producer_id = _manifest->highest_producer_id()};
    iobuf payload = serde::to_iobuf(std::move(state));

    auto taken_at = last_applied_offset();
    apply_units.return_all();

    vlog(
      _logger.debug,
      "creating snapshot at offset: {}, remote start_offset: {}, "
      "last_offset: {}",
      taken_at,
      get_start_offset(),
      get_last_offset());

    co_return raft::stm_snapshot::create(0, taken_at, std::move(payload));
}

// Highest offset that local retention may collect.
//
// Returns model::offset::max() when archival is considered disabled for the
// partition or the partition is a read replica. Otherwise returns the
// smaller of the manifest's last offset and `_last_clean_at`; an empty
// manifest whose last offset is 0 is treated as model::offset::min().
model::offset archival_metadata_stm::max_collectible_offset() {
    // From Redpanda 22.3 up, the ntp_config's impression of whether
    // archival is enabled is authoritative.
    bool archival_disabled = !_raft->log_config().is_archival_enabled();
    const bool read_replica
      = _raft->log_config().is_read_replica_mode_enabled();

    // In earlier versions, we should assume every topic is archival enabled
    // if the global cloud_storage_enable_remote_write is true.
    const bool legacy_remote_write
      = !_feature_table.is_active(features::feature::cloud_retention)
        && config::shard_local_cfg().cloud_storage_enable_remote_write();
    if (legacy_remote_write) {
        archival_disabled = false;
    }

    // With archival disabled the state machine still exists, so eviction
    // must not be blocked. Read replicas never upload, so local retention
    // does not need to be coordinated with this STM either.
    if (archival_disabled || read_replica) {
        return model::offset::max();
    }

    auto collectible = get_last_offset();
    if (_manifest->size() == 0 && collectible == model::offset{0}) {
        collectible = model::offset::min();
    }

    // Do not collect past the offset we last uploaded manifest for: this is
    // needed for correctness because the remote manifest is used in
    // handle_eviction() - it is what a remote node doing snapshot-driven
    // raft recovery will use to start from.
    return std::min(collectible, _last_clean_at);
}

// Fulfil the pending operation promise with `err`, if there is one. The
// promise is detached from `_active_operation_res` before it is completed,
// so a later notification becomes a no-op.
void archival_metadata_stm::maybe_notify_waiter(cluster::errc err) noexcept {
    if (!_active_operation_res.has_value()) {
        return;
    }
    auto pending = std::exchange(_active_operation_res, std::nullopt);
    pending->set_value(err);
}

// Fail the pending operation promise with exception `e`, if there is one.
// The promise is detached from `_active_operation_res` before it is
// completed, so a later notification becomes a no-op.
void archival_metadata_stm::maybe_notify_waiter(std::exception_ptr e) noexcept {
    if (!_active_operation_res.has_value()) {
        return;
    }
    auto pending = std::exchange(_active_operation_res, std::nullopt);
    pending->set_exception(e);
}

// Apply an add_segment command to the manifest.
//
// Validated commands are checked with `safe_segment_meta_to_add` unless the
// consistency checks are disabled by configuration; a failed check is
// logged, reported to the waiter as errc::inconsistent_stm_update, and the
// segment is not added. Otherwise the segment is added and the count of
// bytes removed by replacement is updated.
void archival_metadata_stm::apply_add_segment(const segment& segment) {
    auto meta = segment.meta;

    // Only validated records are checked:
    // - ntp_archiver_service validates segment_meta instances before
    //   replication
    // - replicated add_segment commands have 'is_validated' set to 'yes'
    // - old records in the log have 'is_validated' set to 'no' and are
    //   added unconditionally
    const bool checks_enabled
      = !config::shard_local_cfg()
           .cloud_storage_disable_metadata_consistency_checks.value();
    const bool needs_check = checks_enabled
                             && segment.is_validated == segment_validated::yes;
    if (needs_check && !_manifest->safe_segment_meta_to_add(meta)) {
        auto previous = _manifest->last_segment();
        vlog(
          _logger.error,
          "Can't add segment: {}, previous segment: {}",
          meta,
          previous);
        maybe_notify_waiter(errc::inconsistent_stm_update);
        return;
    }

    // metadata serialized by old versions of redpanda doesn't have the
    // ntp_revision field.
    if (meta.ntp_revision == model::initial_revision_id{}) {
        meta.ntp_revision = segment.ntp_revision_deprecated;
    }

    auto outcome = _manifest->add(segment.name, meta);
    const bool shrank = outcome.has_value()
                        && outcome->bytes_replaced_range
                             > outcome->bytes_new_range;
    if (shrank) {
        _compacted_replaced_bytes += outcome->bytes_replaced_range
                                     - outcome->bytes_new_range;
    }

    vlog(
      _logger.debug,
      "Add segment command applied with {}, new start offset: {}, new last "
      "offset: {}, meta: {}",
      segment.name,
      get_start_offset(),
      get_last_offset(),
      segment.meta);

    // To ensure forward progress a hole is only reported, not rejected.
    if (
      meta.committed_offset > get_last_offset()
      && meta.base_offset > model::next_offset(get_last_offset())) {
        vlog(
          _logger.error,
          "hole in the remote offset range detected! previous last "
          "offset: "
          "{}, new segment base offset: {}",
          get_last_offset(),
          meta.base_offset);
    }
}

// Apply a truncate command: truncate the manifest at `so.start_offset` and
// log the resulting offset range. The value returned by the manifest is not
// used.
void archival_metadata_stm::apply_truncate(const start_offset& so) {
    [[maybe_unused]] auto dropped = _manifest->truncate(so.start_offset);
    const auto new_start = get_start_offset();
    const auto new_last = get_last_offset();
    vlog(
      _logger.debug,
      "Truncate command applied, new start offset: {}, new last offset: {}",
      new_start,
      new_last);
}

// Apply a cleanup_metadata command. Does nothing when
// `get_segments_to_cleanup()` is empty; otherwise drops the replaced
// segments from the manifest and truncates it.
void archival_metadata_stm::apply_cleanup_metadata() {
    const bool nothing_to_do = get_segments_to_cleanup().empty();
    if (nothing_to_do) {
        return;
    }

    _manifest->delete_replaced_segments();
    _manifest->truncate();

    const auto new_start = get_start_offset();
    const auto new_last = get_last_offset();
    vlog(
      _logger.debug,
      "Cleanup metadata command applied, new start offset: {}, new last "
      "offset: {}",
      new_start,
      new_last);
}

// Apply a mark_clean command: record `clean_offset` as the offset the state
// was last clean at.
void archival_metadata_stm::apply_mark_clean(model::offset clean_offset) {
    _last_clean_at = clean_offset;

    const auto start = get_start_offset();
    const auto last = get_last_offset();
    vlog(
      _logger.debug,
      "Mark clean ({}) command applied, new start offset: {}, new last "
      "offset: {}",
      clean_offset,
      start,
      last);
}

// Apply an update_start_offset command by advancing the manifest's start
// offset to `so.start_offset`. A rejected update is logged as an error and
// leaves the state untouched.
void archival_metadata_stm::apply_update_start_offset(const start_offset& so) {
    vlog(
      _logger.debug,
      "Updating start offset, current value {}, update {}",
      get_start_offset(),
      so.start_offset);

    const bool advanced = _manifest->advance_start_offset(so.start_offset);
    if (advanced) {
        vlog(_logger.debug, "Start offset updated to {}", get_start_offset());
        return;
    }

    vlog(
      _logger.error,
      "Can't truncate manifest up to offset {}, offset out of range",
      so.start_offset);
}

// Apply a Kafka start offset override. A rejected override is logged
// together with the override currently stored in the manifest.
void archival_metadata_stm::apply_update_start_kafka_offset(kafka::offset so) {
    const bool accepted = _manifest->advance_start_kafka_offset(so);
    if (accepted) {
        return;
    }

    vlog(
      _logger.error,
      "Can't apply override to kafka start offset {}, currently {}",
      so,
      manifest().get_start_kafka_offset_override());
}

// Apply a reset_metadata command: log the event and reset the manifest via
// `unsafe_reset()`.
void archival_metadata_stm::apply_reset_metadata() {
    vlog(_logger.info, "Resetting manifest");
    _manifest->unsafe_reset();
}

// Check a read-write fence against the manifest's applied offset.
//
// Returns false when the fence matches. Returns true when it does not; in
// that case a warning is logged and the waiter (if any) is notified with
// errc::concurrent_modification_error.
bool archival_metadata_stm::apply_read_write_fence(
  const archival_metadata_stm::read_write_fence_cmd& cmd) noexcept {
    const bool fence_matches = _manifest->get_applied_offset()
                               == cmd.last_applied_offset;
    if (fence_matches) {
        return false;
    }

    vlog(
      _logger.warn,
      "Concurrent modification error detected, current applied offset: {}, "
      "read-write fence: {}",
      _manifest->get_applied_offset(),
      cmd.last_applied_offset);
    maybe_notify_waiter(errc::concurrent_modification_error);
    return true;
}

// Apply an update_highest_producer_id command and log whether the manifest
// accepted the new value.
void archival_metadata_stm::apply_update_highest_producer_id(
  model::producer_id pid) {
    const bool advanced = _manifest->advance_highest_producer_id(pid);
    if (!advanced) {
        vlog(
          _logger.debug,
          "Highest producer ID not updated: {} <= {}",
          pid(),
          _manifest->highest_producer_id());
        return;
    }
    vlog(_logger.debug, "Updated highest producer ID to {}", pid());
}

// Apply a truncate_archive_init command: set the manifest's archive start
// offset and its delta from `so`.
void archival_metadata_stm::apply_truncate_archive_init(
  const start_offset_with_delta& so) {
    const auto current = get_archive_start_offset();
    vlog(
      _logger.debug,
      "Updating archive start offset, current value {}, update {}",
      current,
      so.start_offset);

    _manifest->set_archive_start_offset(so.start_offset, so.delta);
}

// Apply a truncate_archive_commit command: set the manifest's archive clean
// offset to `co`, passing along the number of bytes removed.
void archival_metadata_stm::apply_truncate_archive_commit(
  model::offset co, uint64_t bytes_removed) {
    const auto current = get_archive_clean_offset();
    vlog(
      _logger.debug,
      "Updating archive clean offset, current value {}, update {}",
      current,
      co);

    _manifest->set_archive_clean_offset(co, bytes_removed);
}

// Apply a spillover command. The manifest is only modified when
// `safe_spillover_manifest` accepts the metadata; otherwise the command is
// logged as an error and skipped.
void archival_metadata_stm::apply_spillover(const spillover_cmd& so) {
    const bool is_safe = _manifest->safe_spillover_manifest(so.manifest_meta);
    if (!is_safe) {
        vlog(_logger.error, "Can't apply spillover_cmd: {}", so.manifest_meta);
        return;
    }

    _manifest->spillover(so.manifest_meta);
    vlog(
      _logger.debug,
      "Spillover command applied, new start offset: {}, new last "
      "offset: "
      "{}",
      get_start_offset(),
      get_last_offset());
}

// Apply a replace_manifest command: reload the manifest from the serialized
// buffer and reset the replaced-bytes counter.
void archival_metadata_stm::apply_replace_manifest(iobuf val) {
    _manifest->from_iobuf(std::move(val));
    _compacted_replaced_bytes = 0;

    const auto new_start = get_start_offset();
    const auto new_last = get_last_offset();
    vlog(
      _logger.debug,
      "Replace command applied, new start offset: {}, new last offset: {}",
      new_start,
      new_last);
}

// Apply a process_anomalies command: decode it from `buf` and hand the
// scrub results to the manifest. Any exception raised while decoding or
// applying is logged and swallowed.
void archival_metadata_stm::apply_process_anomalies(iobuf buf) {
    try {
        auto scrub = serde::from_iobuf<process_anomalies_cmd::value>(
          std::move(buf));
        vlog(_logger.debug, "Processing anomalies: {}", scrub.detected);
        _manifest->process_anomalies(
          scrub.scrub_timestamp,
          scrub.last_scrubbed_offset,
          scrub.status,
          std::move(scrub.detected));
    } catch (...) {
        auto failure = std::current_exception();
        vlog(
          _logger.error,
          "Failed to apply process anomalies command: {}",
          failure);
    }
}

// Apply a reset_scrubbing_metadata command: log the event and reset the
// scrubbing metadata stored in the manifest.
void archival_metadata_stm::apply_reset_scrubbing_metadata() {
    vlog(_logger.info, "Resetting scrubbing metadata");
    _manifest->reset_scrubbing_metadata();
}

// Collect the segments that can be removed from the bucket.
//
// The result holds the manifest's replaced segments, minus any whose remote
// name equals the name of a segment still present in the manifest at the
// same base offset, followed by the leading manifest segments whose
// committed offset is below the manifest start offset.
fragmented_vector<cloud_storage::partition_manifest::lw_segment_meta>
archival_metadata_stm::get_segments_to_cleanup() const {
    using lw_segment_meta = cloud_storage::partition_manifest::lw_segment_meta;

    const auto replaced = _manifest->lw_replaced_segments();
    const auto replaced_total = replaced.size();

    // A replaced entry must never alias an active segment: both would map
    // to the same object in the bucket and deleting it would lose data.
    // This should not happen, but guards against bugs elsewhere.
    const auto collides_with_active = [this](const lw_segment_meta& m) {
        auto active = _manifest->find(m.base_offset);
        if (active == _manifest->end()) {
            return false;
        }
        auto m_name = _manifest->generate_remote_segment_name(
          cloud_storage::partition_manifest::lw_segment_meta::convert(m));
        auto s_name = _manifest->generate_remote_segment_name(*active);
        if (m_name != s_name) {
            return false;
        }
        vlog(
          _logger.error,
          "The replaced segment name {} collides with the segment "
          "{} "
          "in the manifest. It will be removed to prevent the data "
          "loss.",
          m_name,
          s_name);
        return true;
    };

    fragmented_vector<lw_segment_meta> backlog;
    for (const auto& candidate : replaced) {
        if (!collides_with_active(candidate)) {
            backlog.push_back(candidate);
        }
    }

    if (backlog.size() < replaced_total) {
        vlog(
          _logger.warn,
          "{} segments will not be removed from the bucket because they're "
          "available in the manifest",
          replaced_total - backlog.size());
    }

    // Segments entirely below the start offset are no longer reachable.
    // Stop at the first segment that is not below it.
    const auto start = _manifest->get_start_offset().value_or(model::offset(0));
    for (const auto& seg : *_manifest) {
        if (!(seg.committed_offset < start)) {
            break;
        }
        backlog.push_back(lw_segment_meta::convert(seg));
    }
    return backlog;
}

// Stop the STM: abort any in-flight manifest download first, then run the
// base class shutdown.
ss::future<> archival_metadata_stm::stop() {
    _download_as.request_abort();
    auto base_stopped = raft::persisted_stm<>::stop();
    co_await std::move(base_stopped);
}

// Read-only view of the manifest held by this STM.
const cloud_storage::partition_manifest&
archival_metadata_stm::manifest() const {
    const cloud_storage::partition_manifest& current = *_manifest;
    return current;
}

// Start offset of the manifest, or a default-constructed offset when the
// manifest does not have one.
model::offset archival_metadata_stm::get_start_offset() const {
    const auto maybe_start = _manifest->get_start_offset();
    return maybe_start.has_value() ? maybe_start.value() : model::offset{};
}

// Last offset recorded in the manifest.
model::offset archival_metadata_stm::get_last_offset() const {
    const auto& current = *_manifest;
    return current.get_last_offset();
}

// Archive start offset recorded in the manifest.
model::offset archival_metadata_stm::get_archive_start_offset() const {
    const auto& current = *_manifest;
    return current.get_archive_start_offset();
}

// Archive clean offset recorded in the manifest.
model::offset archival_metadata_stm::get_archive_clean_offset() const {
    const auto& current = *_manifest;
    return current.get_archive_clean_offset();
}

// Kafka start offset of the manifest, or a default-constructed Kafka offset
// when the manifest does not have one.
kafka::offset archival_metadata_stm::get_start_kafka_offset() const {
    const auto maybe_start = _manifest->get_start_kafka_offset();
    return maybe_start.has_value() ? maybe_start.value() : kafka::offset{};
}

/**
 * Dirty means "an upload to object store is required".
 *
 * @param projected_clean when set, evaluate the state as if a clean record
 *        referring to this offset had been applied, instead of using the
 *        last clean offset stored in the STM.
 * @return state_dirty::clean if the clean offset under consideration is at
 *         or past the last record that dirtied the STM, state_dirty::dirty
 *         otherwise. Without a projection, the stored clean offset must
 *         additionally be non-negative.
 */
archival_metadata_stm::state_dirty archival_metadata_stm::get_dirty(
  std::optional<model::offset> projected_clean) const {
    bool is_clean = false;
    if (projected_clean.has_value()) {
        is_clean = projected_clean.value() >= _last_dirty_at;
    } else {
        // A negative stored offset means no clean record was written yet.
        is_clean = _last_clean_at >= model::offset{0}
                   && _last_clean_at >= _last_dirty_at;
    }
    return is_clean ? state_dirty::clean : state_dirty::dirty;
}

// Store the cloud storage flag and the references to the sharded services
// that `is_applicable_for` and `create` read later.
archival_metadata_stm_factory::archival_metadata_stm_factory(
  bool cloud_storage_enabled,
  ss::sharded<cloud_storage::remote>& cloud_storage_api,
  ss::sharded<features::feature_table>& feature_table,
  ss::sharded<cluster::topic_table>& topics)
  : _cloud_storage_enabled(cloud_storage_enabled)
  , _cloud_storage_api(cloud_storage_api)
  , _feature_table(feature_table)
  , _topics(topics) {}

// The archival STM is created only when cloud storage is enabled, the local
// cloud storage API instance is initialized, and the partition belongs to
// the Kafka namespace.
bool archival_metadata_stm_factory::is_applicable_for(
  const storage::ntp_config& ntp_cfg) const {
    if (!_cloud_storage_enabled) {
        return false;
    }
    if (!_cloud_storage_api.local_is_initialized()) {
        return false;
    }
    return ntp_cfg.ntp().ns == model::kafka_namespace;
}

// Create the archival metadata STM for `raft` and register it with the
// log's STM manager. The remote label and the remote topic namespace
// override are taken from the topic configuration when the topic is found
// in the topic table, and are left empty otherwise.
void archival_metadata_stm_factory::create(
  raft::state_machine_manager_builder& builder, raft::consensus* raft) {
    auto md_ref = _topics.local().get_topic_metadata_ref(
      model::topic_namespace_view(raft->ntp()));
    const bool topic_known = md_ref.has_value();

    auto label
      = topic_known
          ? md_ref->get().get_configuration().properties.remote_label
          : std::nullopt;
    auto tp_ns_override
      = topic_known ? md_ref->get()
                        .get_configuration()
                        .properties.remote_topic_namespace_override
                    : std::nullopt;

    auto archival_stm = builder.create_stm<cluster::archival_metadata_stm>(
      raft,
      _cloud_storage_api.local(),
      _feature_table.local(),
      clusterlog,
      label,
      tp_ns_override);
    raft->log()->stm_manager()->add_stm(archival_stm);
}

} // namespace cluster
