// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.

package cli

import (
	"context"
	"encoding/json"
	"fmt"
	"net/url"
	"os"
	"path"
	"runtime"
	"sort"
	"strconv"
	"strings"
	"text/tabwriter"
	"time"

	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/grafana"
	"github.com/cockroachdb/cockroach/pkg/roachprod"
	"github.com/cockroachdb/cockroach/pkg/roachprod/cloud"
	"github.com/cockroachdb/cockroach/pkg/roachprod/config"
	"github.com/cockroachdb/cockroach/pkg/roachprod/install"
	"github.com/cockroachdb/cockroach/pkg/roachprod/roachprodutil"
	"github.com/cockroachdb/cockroach/pkg/roachprod/ui"
	"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
	"github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"github.com/fatih/color"
	"github.com/spf13/cobra"
	"golang.org/x/crypto/ssh"
	"golang.org/x/exp/maps"
	"golang.org/x/text/language"
	"golang.org/x/text/message"
)

// Help-text fragments that are spliced into the long descriptions of the
// commands defined in this package.
const (
	// AuthModeHelp describes the values accepted by the --auth-mode flag.
	AuthModeHelp = `
--auth-mode specifies the method of authentication unless --insecure is passed.
Defaults to root if not passed. Available auth-modes are:

	root: authenticates with the root user and root certificates

	user-password: authenticates with the default roachprod user and password

	user-cert: authenticates with the default roachprod user and certificates`

	// tagHelp describes the --tag flag and its prefix-matching semantics.
	tagHelp = `
The --tag flag can be used to associate a tag with the process. This tag can
then be used to restrict the processes which are operated on by the status and
stop commands. Tags can have a hierarchical component by utilizing a slash
separated string similar to a filesystem path. A tag matches if a prefix of the
components match. For example, the tag "a/b" will match both "a/b" and
"a/b/c/d".
`
	// The following fragments each describe one application name. Every
	// fragment starts with a newline so that several of them can be
	// concatenated into a list.
	cockroachApp = `
  cockroach  - Cockroach nightly builds. Can provide an optional SHA, otherwise
               latest build version is used.`
	releaseApp = `
  release    - Official CockroachDB Release. Must provide a specific release
               version.`
	customizedApp = `
  customized - Cockroach customized builds, usually generated by running
               ./scripts/tag-custom-build.sh. Must provide a specific tag.`
	localApp = `
  local      - Use a provided local binary, must provide the path to the binary.`
	workloadApp = `
  workload   - Cockroach workload application.`
)

// bashCompletion is the path of the generated bash completion script, located
// under the .roachprod directory of the directory named by $HOME.
var bashCompletion = os.Getenv("HOME") + "/.roachprod/bash-completion.sh"

// buildCreateCmd returns the "create" command. The command takes exactly one
// argument, the cluster name, and calls roachprod.Create with the node count,
// VM options and provider options held in package-level variables.
func (cr *commandRegistry) buildCreateCmd() *cobra.Command {
	const longHelp = `Create a local or cloud-based cluster.

A cluster is composed of a set of nodes, configured during cluster creation via
the --nodes flag. Creating a cluster does not start any processes on the nodes
other than the base system processes (e.g. sshd). See "roachprod start" for
starting cockroach nodes and "roachprod {run,ssh}" for running arbitrary
commands on the nodes of a cluster.

Cloud Clusters

  Cloud-based clusters are ephemeral and come with a lifetime (specified by the
  --lifetime flag) after which they will be automatically
  destroyed. Cloud-based clusters require the associated command line tool for
  the cloud to be installed and configured (e.g. "gcloud auth login").

  Clusters names are required to be prefixed by the authenticated user of the
  cloud service. The suffix is an arbitrary string used to distinguish
  clusters. For example, "marc-test" is a valid cluster name for the user
  "marc". The authenticated user for the cloud service is automatically
  detected and can be override by the ROACHPROD_USER environment variable or
  the --username flag.

  The machine type and the use of local SSD storage can be specified during
  cluster creation via the --{cloud}-machine-type and --local-ssd flags. The
  machine-type is cloud specified. For example, --gce-machine-type=n1-highcpu-8
  requests the "n1-highcpu-8" machine type for a GCE-based cluster. No attempt
  is made (or desired) to abstract machine types across cloud providers. See
  the cloud provider's documentation for details on the machine types
  available.

  The underlying filesystem can be provided using the --filesystem flag.
  Use --filesystem=zfs, for zfs, and --filesystem=ext4, for ext4. The default
  file system is ext4. The filesystem flag only works on gce currently.

Local Clusters

  A local cluster stores the per-node data in ${HOME}/local on the machine
  roachprod is being run on. Whether a cluster is local is specified on creation
  by using the name 'local' or 'local-<anything>'. Local clusters have no expiration.
`

	// The cluster name is written into the shared createVMOpts before the
	// options struct is assembled, so the struct carries the name as well.
	runCreate := func(_ *cobra.Command, args []string) error {
		createVMOpts.ClusterName = args[0]
		return roachprod.Create(context.Background(), config.Logger, username, &cloud.ClusterCreateOpts{
			Nodes:                 numNodes,
			CreateOpts:            createVMOpts,
			ProviderOptsContainer: providerOptsContainer,
		})
	}

	command := &cobra.Command{
		Use:   "create <cluster>",
		Short: "create a cluster",
		Long:  longHelp,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(runCreate),
	}
	cr.addToExcludeFromBashCompletion(command)
	cr.addToExcludeFromClusterFlagsMulti(command)
	initCreateCmdFlags(command)
	initFlagUsernameForCmd(command)
	return command
}

// buildGrowCmd returns the "grow" command, which takes a cluster name and a
// node count and calls roachprod.Grow to add that many nodes.
//
// The node count must parse as a base-10 integer that fits in 8 bits and must
// be at least 1; otherwise the command fails without calling roachprod.Grow.
func (cr *commandRegistry) buildGrowCmd() *cobra.Command {
	growCmd := &cobra.Command{
		Use:   `grow <cluster> <num-nodes>`,
		Short: `grow a cluster by adding nodes`,
		Long: `grow a cluster by adding the specified number of nodes to it.

Only Google Cloud and local clusters currently support adding nodes. The Google
Cloud cluster has to be a managed cluster (i.e., a cluster created with the
gce-managed flag). The new nodes will use the instance template that was used to
create the cluster originally (Nodes will be created in the same zone as the
existing nodes, or if the cluster is geographically distributed, the nodes will
be fairly distributed across the zones of the cluster).
`,
		Args: cobra.ExactArgs(2),
		Run: wrap(func(cmd *cobra.Command, args []string) error {
			count, err := strconv.ParseInt(args[1], 10, 8)
			if err != nil {
				return errors.Wrapf(err, "invalid num-nodes argument")
			}
			// The range check needs its own error: wrapping a nil error yields
			// nil, which would report success for a zero or negative count.
			if count < 1 {
				return errors.Newf("invalid num-nodes argument: %d (must be at least 1)", count)
			}
			return roachprod.Grow(context.Background(), config.Logger, args[0], isSecure, int(count))
		}),
	}
	initFlagInsecureForCmd(growCmd)
	return growCmd
}

// buildShrinkCmd returns the "shrink" command, which takes a cluster name and
// a node count and calls roachprod.Shrink to remove that many nodes.
//
// The node count must parse as a base-10 integer that fits in 8 bits and must
// be at least 1; otherwise the command fails without calling
// roachprod.Shrink.
func (cr *commandRegistry) buildShrinkCmd() *cobra.Command {
	return &cobra.Command{
		Use:   `shrink <cluster> <num-nodes>`,
		Short: `shrink a cluster by removing nodes`,
		Long: `shrink a cluster by removing the specified number of nodes.

Only Google Cloud and local clusters currently support removing nodes. The
Google Cloud cluster has to be a managed cluster (i.e., a cluster created with
the gce-managed flag). Nodes are removed from the tail end of the cluster.
Removing nodes from the middle of the cluster is not supported yet.
`,
		Args: cobra.ExactArgs(2),
		Run: wrap(func(cmd *cobra.Command, args []string) error {
			count, err := strconv.ParseInt(args[1], 10, 8)
			if err != nil {
				return errors.Wrapf(err, "invalid num-nodes argument")
			}
			// The range check needs its own error: wrapping a nil error yields
			// nil, which would report success for a zero or negative count.
			if count < 1 {
				return errors.Newf("invalid num-nodes argument: %d (must be at least 1)", count)
			}
			return roachprod.Shrink(context.Background(), config.Logger, args[0], int(count))
		}),
	}
}

// buildResetCmd returns the "reset" command, which passes its single cluster
// argument to roachprod.Reset.
func (cr *commandRegistry) buildResetCmd() *cobra.Command {
	runReset := func(_ *cobra.Command, args []string) error {
		clusterName := args[0]
		return roachprod.Reset(config.Logger, clusterName)
	}
	resetCmd := &cobra.Command{
		Use:   "reset <cluster>",
		Short: "reset *all* VMs in a cluster",
		Long:  `Reset a cloud VM.`,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(runReset),
	}
	return resetCmd
}

// buildDestroyCmd returns the "destroy" command. It accepts any number of
// cluster names and forwards them, together with the package-level username,
// destroyAllMine and destroyAllLocal values, to roachprod.Destroy.
func (cr *commandRegistry) buildDestroyCmd() *cobra.Command {
	const longHelp = `Destroy one or more local or cloud-based clusters.

The destroy command accepts the names of the clusters to destroy. Alternatively,
the --all-mine flag can be provided to destroy all (non-local) clusters that are
owned by the current user, or the --all-local flag can be provided to destroy
all local clusters.

Destroying a cluster releases the resources for a cluster. For a cloud-based
cluster the machine and associated disk resources are freed. For a local
cluster, any processes started by roachprod are stopped, and the node
directories inside ${HOME}/local directory are removed.
`
	runDestroy := func(_ *cobra.Command, clusterNames []string) error {
		return roachprod.Destroy(config.Logger, username, destroyAllMine, destroyAllLocal, clusterNames...)
	}

	command := &cobra.Command{
		Use:   "destroy [ --all-mine | --all-local | <cluster 1> [<cluster 2> ...] ]",
		Short: "destroy clusters",
		Long:  longHelp,
		Args:  cobra.ArbitraryArgs,
		Run:   wrap(runDestroy),
	}
	initDestroyCmdFlags(command)
	initFlagUsernameForCmd(command)
	return command
}

// buildExtendCmd returns the "extend" command, which calls roachprod.Extend
// for the named cluster with the duration given by --lifetime/-l. The flag is
// bound to the package-level extendLifetime and defaults to 12 hours.
func (cr *commandRegistry) buildExtendCmd() *cobra.Command {
	const longHelp = `Extend the lifetime of the specified cluster to prevent it from being
destroyed:

  roachprod extend marc-test --lifetime=6h
`
	runExtend := func(_ *cobra.Command, args []string) error {
		return roachprod.Extend(config.Logger, args[0], extendLifetime)
	}

	command := &cobra.Command{
		Use:   "extend <cluster>",
		Short: "extend the lifetime of a cluster",
		Long:  longHelp,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(runExtend),
	}
	flags := command.Flags()
	flags.DurationVarP(&extendLifetime, "lifetime", "l", 12*time.Hour, "Lifetime of the cluster")
	return command
}

// buildLoadBalancerCmd returns the "load-balancer" parent command and attaches
// the create, pgurl and ip subcommands to it.
//
// NOTE(review): the parent command has its own Run handler that creates a
// load balancer when invoked with exactly one argument, mirroring the "create"
// subcommand; no flags are registered on the parent here. Confirm this is
// intended.
func (cr *commandRegistry) buildLoadBalancerCmd() *cobra.Command {
	runParent := func(_ *cobra.Command, args []string) error {
		clusterName := args[0]
		return roachprod.CreateLoadBalancer(
			context.Background(), config.Logger, clusterName, isSecure, virtualClusterName, sqlInstance,
		)
	}

	parent := &cobra.Command{
		Use:   "load-balancer [command]",
		Short: "manage and query load balancers",
		Long:  `create load balancers for specific services, query the IP or postgres URL of a load balancer`,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(runParent),
	}
	parent.AddCommand(buildCreateLoadBalancerCmd(), buildLoadBalancerPGUrl(), buildLoadBalancerIP())
	return parent
}

// buildCreateLoadBalancerCmd returns the "create" subcommand of
// "load-balancer". It calls roachprod.CreateLoadBalancer for the named cluster
// using the package-level isSecure, virtualClusterName and sqlInstance values.
func buildCreateLoadBalancerCmd() *cobra.Command {
	const longHelp = `Create a load balancer for a specific service (port), system by default, for the given cluster.

The load balancer is created using the cloud provider's load balancer service.
Currently only Google Cloud is supported, and the cluster must have been created
with the --gce-managed flag. On Google Cloud a load balancer consists of various
components that include backend services, health checks and forwarding rules.
These resources will automatically be destroyed when the cluster is destroyed.
`
	runCreate := func(_ *cobra.Command, args []string) error {
		clusterName := args[0]
		return roachprod.CreateLoadBalancer(
			context.Background(), config.Logger, clusterName, isSecure, virtualClusterName, sqlInstance,
		)
	}

	command := &cobra.Command{
		Use:   "create <cluster>",
		Short: "create a load balancer for a cluster",
		Long:  longHelp,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(runCreate),
	}
	initFlagInsecureForCmd(command)
	initFlagsClusterNSQLForCmd(command)
	return command
}

// buildLoadBalancerPGUrl returns the "pgurl" subcommand of "load-balancer".
// It resolves the requested auth mode, asks roachprod.LoadBalancerPgURL for
// the URL of the named cluster's load balancer and prints it to stdout.
func buildLoadBalancerPGUrl() *cobra.Command {
	longHelp := fmt.Sprintf(`Get the postgres URL of a load balancer.
%[1]s`, strings.TrimSpace(AuthModeHelp))

	runPGUrl := func(_ *cobra.Command, args []string) error {
		resolvedAuth, err := install.ResolveAuthMode(authMode)
		if err != nil {
			return err
		}
		urlOpts := roachprod.PGURLOptions{
			Database:           database,
			External:           external,
			Secure:             isSecure,
			VirtualClusterName: virtualClusterName,
			SQLInstance:        sqlInstance,
			Auth:               resolvedAuth,
		}
		pgURL, err := roachprod.LoadBalancerPgURL(
			context.Background(), config.Logger, args[0], pgurlCertsDir, urlOpts,
		)
		if err != nil {
			return err
		}
		fmt.Println(pgURL)
		return nil
	}

	command := &cobra.Command{
		Use:   "pgurl <cluster>",
		Short: "get the postgres URL of a load balancer",
		Long:  longHelp,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(runPGUrl),
	}
	initFlagPgurlCertsDirForCmd(command)
	initFlagAuthModeNDatabaseForCmd(command)
	initFlagInsecureForCmd(command)
	initFlagsClusterNSQLForCmd(command)
	return command
}

// buildLoadBalancerIP returns the "ip" subcommand of "load-balancer". It looks
// up the load balancer address for the named cluster through
// roachprod.LoadBalancerIP and prints it to stdout.
func buildLoadBalancerIP() *cobra.Command {
	runIP := func(_ *cobra.Command, args []string) error {
		addr, err := roachprod.LoadBalancerIP(
			context.Background(), config.Logger, args[0], virtualClusterName, sqlInstance,
		)
		if err == nil {
			fmt.Println(addr)
		}
		return err
	}

	command := &cobra.Command{
		Use:   "ip <cluster>",
		Short: "get the IP address of a load balancer",
		Long:  "Get the IP address of a load balancer.",
		Args:  cobra.ExactArgs(1),
		Run:   wrap(runIP),
	}
	initFlagsClusterNSQLForCmd(command)
	return command
}

// buildListCmd returns the "list" command. The command fetches the clusters
// via roachprod.List (filtered by the --mine/--pattern settings) and renders
// them in one of three forms:
//
//   - --json: the whole result is encoded as indented JSON on stdout;
//   - --details: each cluster prints its own details, followed by any bad
//     instance errors sorted for stable output;
//   - default: a right-aligned table with one row per cluster, followed by the
//     total hourly cost when it is non-zero.
//
// --json and --details are mutually exclusive.
func (cr *commandRegistry) buildListCmd() *cobra.Command {
	listCmd := &cobra.Command{
		Use:   "list [--details | --json] [ --mine | --pattern ]",
		Short: "list all clusters",
		Long: `List all clusters.

The list command accepts a flag --pattern which is a regular
expression that will be matched against the cluster name pattern.  Alternatively,
the --mine flag can be provided to list the clusters that are owned by the current
user.

The default output shows one line per cluster, including the local cluster if
it exists:

  ~ roachprod list
  local:     [local]    1  (-)
  marc-test: [aws gce]  4  (5h34m35s)
  Syncing...

The second column lists the cloud providers that host VMs for the cluster.

The third and fourth columns are the number of nodes in the cluster and the
time remaining before the cluster will be automatically destroyed. Note that
local clusters do not have an expiration.

The --details flag adjusts the output format to include per-node details:

  ~ roachprod list --details
  local [local]: (no expiration)
    localhost		127.0.0.1	127.0.0.1
  marc-test: [aws gce] 5h33m57s remaining
    marc-test-0001	marc-test-0001.us-east1-b.cockroach-ephemeral	10.142.0.18	35.229.60.91
    marc-test-0002	marc-test-0002.us-east1-b.cockroach-ephemeral	10.142.0.17	35.231.0.44
    marc-test-0003	marc-test-0003.us-east1-b.cockroach-ephemeral	10.142.0.19	35.229.111.100
    marc-test-0004	marc-test-0004.us-east1-b.cockroach-ephemeral	10.142.0.20	35.231.102.125
  Syncing...

The first and second column are the node hostname and fully qualified name
respectively. The third and fourth column are the private and public IP
addresses.

The --json flag sets the format of the command output to json.

Listing clusters has the side-effect of syncing ssh keys/configs and the local
hosts file.
`,
		Args: cobra.NoArgs,
		Run: wrap(func(cmd *cobra.Command, args []string) error {
			if listJSON && listDetails {
				return errors.New("'json' option cannot be combined with 'details' option")
			}
			filteredCloud, err := roachprod.List(config.Logger, listMine, listPattern,
				vm.ListOptions{
					Username:             username,
					ComputeEstimatedCost: listCost,
				})

			if err != nil {
				return err
			}

			// sort by cluster names for stable output.
			names := make([]string, len(filteredCloud.Clusters))
			maxClusterName := 0
			i := 0
			for name := range filteredCloud.Clusters {
				names[i] = name
				if len(name) > maxClusterName {
					maxClusterName = len(name)
				}
				i++
			}
			sort.Strings(names)

			// The printer formats the dollar amounts below using English
			// number conventions.
			p := message.NewPrinter(language.English)
			if listJSON {
				enc := json.NewEncoder(os.Stdout)
				enc.SetIndent("", "  ")
				if err := enc.Encode(filteredCloud); err != nil {
					return err
				}
			} else {
				// machineType reports the machine type of the first VM only.
				machineType := func(clusterVMs vm.List) string {
					return clusterVMs[0].MachineType
				}
				// cpuArch reports the CPU family of the first VM if known,
				// otherwise its architecture unless that is the AMD64 default.
				cpuArch := func(clusterVMs vm.List) string {
					// Display CPU architecture and family.
					if clusterVMs[0].CPUArch == "" {
						// N.B. Either a local cluster or unsupported cloud provider.
						return ""
					}
					if clusterVMs[0].CPUFamily != "" {
						return clusterVMs[0].CPUFamily
					}
					if clusterVMs[0].CPUArch != vm.ArchAMD64 {
						return string(clusterVMs[0].CPUArch)
					}
					// AMD64 is the default, so don't display it.
					return ""
				}
				// Align columns right and separate with at least two spaces.
				tw := tabwriter.NewWriter(os.Stdout, 0, 8, 2, ' ', tabwriter.AlignRight)
				// N.B. colors use escape codes which don't play nice with tabwriter [1].
				// We use a hacky workaround below to color the empty string.
				// [1] https://github.com/golang/go/issues/12073

				if !listDetails {
					// Print header only if we are not printing cluster details.
					fmt.Fprintf(tw, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\n",
						"Cluster", "Clouds", "Size", "VM", "Arch",
						color.HiWhiteString("$/hour"), color.HiWhiteString("$ Spent"),
						color.HiWhiteString("Uptime"), color.HiWhiteString("TTL"),
						color.HiWhiteString("$/TTL"))
					// Print separator. It must have the same number of cells as
					// the header and the data rows (five plain, five colored):
					// tabwriter only aligns a column across contiguous lines
					// that all have a cell in it, so a shorter separator would
					// detach the header's last column from the data rows.
					fmt.Fprintf(tw, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\n",
						"", "", "", "", "",
						color.HiWhiteString(""), color.HiWhiteString(""),
						color.HiWhiteString(""), color.HiWhiteString(""),
						color.HiWhiteString(""))
				}
				totalCostPerHour := 0.0
				for _, name := range names {
					c := filteredCloud.Clusters[name]
					if listDetails {
						if err = c.PrintDetails(config.Logger); err != nil {
							return err
						}
					} else {
						// N.B. Tabwriter doesn't support per-column alignment. It looks odd to have the cluster names right-aligned,
						// so we make it left-aligned.
						fmt.Fprintf(tw, "%s\t%s\t%d\t%s\t%s", name+strings.Repeat(" ", maxClusterName-len(name)), c.Clouds(),
							len(c.VMs), machineType(c.VMs), cpuArch(c.VMs))
						if !c.IsLocal() {
							// colorByCostBucket picks green up to $100, blue up
							// to $1000 and red above that.
							colorByCostBucket := func(cost float64) func(string, ...interface{}) string {
								switch {
								case cost <= 100:
									return color.HiGreenString
								case cost <= 1000:
									return color.HiBlueString
								default:
									return color.HiRedString
								}
							}
							timeRemaining := c.LifetimeRemaining().Round(time.Second)
							// formatTTL colors the TTL magenta when the first VM
							// is preemptible and blue otherwise.
							formatTTL := func(ttl time.Duration) string {
								if c.VMs[0].Preemptible {
									return color.HiMagentaString(ttl.String())
								}
								return color.HiBlueString(ttl.String())
							}
							cost := c.CostPerHour
							totalCostPerHour += cost
							alive := timeutil.Since(c.CreatedAt).Round(time.Minute)
							costSinceCreation := cost * float64(alive) / float64(time.Hour)
							costRemaining := cost * float64(timeRemaining) / float64(time.Hour)
							if cost > 0 {
								fmt.Fprintf(tw, "\t%s\t%s\t%s\t%s\t%s\t",
									color.HiGreenString(p.Sprintf("$%.2f", cost)),
									colorByCostBucket(costSinceCreation)(p.Sprintf("$%.2f", costSinceCreation)),
									color.HiWhiteString(alive.String()),
									formatTTL(timeRemaining),
									colorByCostBucket(costRemaining)(p.Sprintf("$%.2f", costRemaining)))
							} else {
								// No cost is known: leave the cost cells as
								// colored empty strings so the columns keep
								// their escape-code width.
								fmt.Fprintf(tw, "\t%s\t%s\t%s\t%s\t%s\t",
									color.HiGreenString(""),
									color.HiGreenString(""),
									color.HiWhiteString(alive.String()),
									formatTTL(timeRemaining),
									color.HiGreenString(""))
							}
						} else {
							fmt.Fprintf(tw, "\t(-)")
						}
						fmt.Fprintf(tw, "\n")
					}
				}
				if err := tw.Flush(); err != nil {
					return err
				}

				if totalCostPerHour > 0 {
					_, _ = p.Printf("\nTotal cost per hour: $%.2f\n", totalCostPerHour)
				}

				// Optionally print any dangling instances with errors
				if listDetails {
					collated := filteredCloud.BadInstanceErrors()

					// Sort by Error() value for stable output
					var sortedErrs ui.ErrorsByError
					for badErr := range collated {
						sortedErrs = append(sortedErrs, badErr)
					}
					sort.Sort(sortedErrs)

					for _, e := range sortedErrs {
						fmt.Printf("%s: %s\n", e, collated[e].Names())
					}
				}
			}
			return nil
		}),
	}
	cr.addToExcludeFromBashCompletion(listCmd)
	initListCmdFlags(listCmd)
	initFlagUsernameForCmd(listCmd)
	initFlagDNSRequiredProvidersForCmd(listCmd)
	return listCmd
}

// buildSyncCmd returns the "sync" command, which calls roachprod.Sync and then
// regenerates the bash completion file at the bashCompletion path. A failure
// to write the completion file is ignored; only the Sync error is reported.
//
// TODO(peter): Do we need this command given that the "list" command syncs as
// a side-effect. If you don't care about the list output, just "roachprod list
// &>/dev/null".
func (cr *commandRegistry) buildSyncCmd() *cobra.Command {
	runSync := func(_ *cobra.Command, _ []string) error {
		_, syncErr := roachprod.Sync(config.Logger, listOpts)
		// Best effort: the completion file is regenerated even if Sync failed.
		_ = cr.rootCmd.GenBashCompletionFile(bashCompletion)
		return syncErr
	}

	command := &cobra.Command{
		Use:   "sync [flags]",
		Short: "sync ssh keys/config and hosts files",
		Long:  ``,
		Args:  cobra.NoArgs,
		Run:   wrap(runSync),
	}
	cr.addToExcludeFromBashCompletion(command)
	initSyncCmdFlags(command)
	initFlagDNSRequiredProvidersForCmd(command)
	return command
}

// buildGCCmd returns the "gc" command, which takes no arguments and calls
// roachprod.GC with the package-level dryrun setting.
func (cr *commandRegistry) buildGCCmd() *cobra.Command {
	const longHelp = `Garbage collect expired clusters and unused SSH keypairs in AWS.

Destroys expired clusters, sending email if properly configured. Usually run
hourly by a cronjob so it is not necessary to run manually.
`
	runGC := func(_ *cobra.Command, _ []string) error {
		return roachprod.GC(config.Logger, dryrun)
	}

	command := &cobra.Command{
		Use:   "gc",
		Short: "GC expired clusters and unused AWS keypairs\n",
		Long:  longHelp,
		Args:  cobra.NoArgs,
		Run:   wrap(runGC),
	}
	cr.addToExcludeFromBashCompletion(command)
	initGCCmdFlags(command)
	return command
}

// buildSetupSSHCmd returns the "setup-ssh" command, which passes its single
// cluster argument to roachprod.SetupSSH.
func (cr *commandRegistry) buildSetupSSHCmd() *cobra.Command {
	const longHelp = `Sets up the keys and host keys for the vms in the cluster.

It first resets the machine credentials as though the cluster were newly created
using the cloud provider APIs and then proceeds to ensure that the hosts can
SSH into eachother and lastly adds additional public keys to AWS hosts as read
from the GCP project. This operation is performed as the last step of creating
a new cluster but can be useful to re-run if the operation failed previously or
if the user would like to update the keys on the remote hosts.
`
	runSetupSSH := func(_ *cobra.Command, args []string) error {
		clusterName := args[0]
		return roachprod.SetupSSH(context.Background(), config.Logger, clusterName)
	}

	setupSSHCmd := &cobra.Command{
		Use:   "setup-ssh <cluster>",
		Short: "set up ssh for a cluster",
		Long:  longHelp,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(runSetupSSH),
	}
	return setupSSHCmd
}

// buildStatusCmd returns the "status" command. It queries roachprod.Status for
// the named cluster (restricted by the --tag value) and logs one line per
// node: the error if the status lookup failed for that node, "not running" if
// no process was found, or the version and PID otherwise.
func (cr *commandRegistry) buildStatusCmd() *cobra.Command {
	statusCmd := &cobra.Command{
		Use:   "status <cluster>",
		Short: "retrieve the status of nodes in a cluster",
		Long: `Retrieve the status of nodes in a cluster.

The "status" command outputs the binary and PID for the specified nodes:

  ~ roachprod status local
  local: status 3/3
     1: cockroach 29688
     2: cockroach 29687
     3: cockroach 29689
` + tagHelp + `
`,
		Args: cobra.ExactArgs(1),
		Run: wrap(func(cmd *cobra.Command, args []string) error {
			statuses, err := roachprod.Status(context.Background(), config.Logger, args[0], tag)
			if err != nil {
				return err
			}
			for _, status := range statuses {
				switch {
				case status.Err != nil:
					// Exactly one %s for the error text; a second verb without
					// an operand would print "%!s(MISSING)".
					config.Logger.Printf("  %2d: %s\n", status.NodeID, status.Err.Error())
				case !status.Running:
					// TODO(irfansharif): Surface the staged version here?
					config.Logger.Printf("  %2d: not running\n", status.NodeID)
				default:
					config.Logger.Printf("  %2d: %s %s\n", status.NodeID, status.Version, status.Pid)
				}
			}
			return nil
		}),
	}
	addHelpAboutNodes(statusCmd)
	initFlagInsecureIgnoreHostKeyForCmd(statusCmd)
	initFlagTagForCmd(statusCmd)
	return statusCmd
}

// buildMonitorCmd returns the "monitor" command. It starts roachprod.Monitor
// for the named cluster and prints every event received on the returned
// channel to stdout until the channel is closed.
func (cr *commandRegistry) buildMonitorCmd() *cobra.Command {
	const longHelp = `Monitor the status of cockroach nodes in a cluster.

The "monitor" command runs until terminated. At startup it outputs a line for
each specified node indicating the status of the node (either the PID of the
node if alive, or "dead" otherwise). It then watches for changes in the status
of nodes, outputting a line whenever a change is detected:

  ~ roachprod monitor local
  1: 29688
  3: 29689
  2: 29687
  3: dead
  3: 30718
`
	runMonitor := func(_ *cobra.Command, args []string) error {
		events, err := roachprod.Monitor(context.Background(), config.Logger, args[0], monitorOpts)
		if err != nil {
			return err
		}
		for event := range events {
			fmt.Println(event.String())
		}
		return nil
	}

	command := &cobra.Command{
		Use:   "monitor",
		Short: "monitor the status of nodes in a cluster",
		Long:  longHelp,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(runMonitor),
	}
	initMonitorCmdFlags(command)
	initFlagInsecureIgnoreHostKeyForCmd(command)
	return command
}

// buildStartCmd returns the "start" command. It assembles the cluster setting
// options from package-level flag variables and calls roachprod.Start for the
// named cluster with the package-level startOpts.
func (cr *commandRegistry) buildStartCmd() *cobra.Command {
	const longHelp = `Start nodes on a cluster.

Nodes are started in secure mode by default and there is a one time
initialization for the cluster to create and distribute the certs.
Note that running some modes in secure mode and others in insecure
mode is not a supported Cockroach configuration. To start nodes in
insecure mode, use the --insecure flag.

The --binary flag specifies the remote binary to run. It is up to the roachprod
user to ensure this binary exists, usually via "roachprod put". Note that no
cockroach software is installed by default on a newly created cluster.

The --args and --env flags can be used to pass arbitrary command line flags and
environment variables to the cockroach process.
` + tagHelp + `
The "start" command takes care of setting up the --join address and specifying
reasonable defaults for other flags. One side-effect of this convenience is
that node 1 is special and if started, is used to auto-initialize the cluster.
The --skip-init flag can be used to avoid auto-initialization (which can then
separately be done using the "init" command).

If the COCKROACH_DEV_LICENSE environment variable is set the enterprise.license
cluster setting will be set to its value.
`
	runStart := func(_ *cobra.Command, args []string) error {
		var settings []install.ClusterSettingOption
		settings = append(settings,
			install.TagOption(tag),
			install.PGUrlCertsDirOption(pgurlCertsDir),
			install.SecureOption(isSecure),
			install.UseTreeDistOption(useTreeDist),
			install.EnvOption(nodeEnv),
			install.NumRacksOption(numRacks),
		)
		return roachprod.Start(context.Background(), config.Logger, args[0], startOpts, settings...)
	}

	command := &cobra.Command{
		Use:   "start <cluster>",
		Short: "start nodes on a cluster",
		Long:  longHelp,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(runStart),
	}
	addHelpAboutNodes(command)
	initStartCmdFlags(command)
	initFlagsStartOpsForCmd(command)
	initFlagInsecureIgnoreHostKeyForCmd(command)
	initFlagTagForCmd(command)
	initFlagSCPForCmd(command)
	initFlagBinaryForCmd(command)
	initFlagInsecureForCmd(command)
	initFlagDNSRequiredProvidersForCmd(command)
	return command
}

// buildUpdateTargetsCmd returns the "update-targets" command, which calls
// roachprod.UpdateTargets for the named cluster, passing the secure-mode
// setting as its only cluster setting option.
func (cr *commandRegistry) buildUpdateTargetsCmd() *cobra.Command {
	const longHelp = `Update prometheus target configurations of each node of a cluster.

The "start" command updates the prometheus target configuration every time. But, in case of any
failure, this command can be used to update the configurations.

The default prometheus url is https://grafana.testeng.crdb.io/. This can be overwritten by using the
environment variable COCKROACH_PROM_HOST_URL

Note that if the cluster is started in insecure mode, set the insecure mode here as well by using the --insecure flag.
`
	runUpdateTargets := func(_ *cobra.Command, args []string) error {
		secureSetting := install.SecureOption(isSecure)
		return roachprod.UpdateTargets(context.Background(), config.Logger, args[0], secureSetting)
	}

	command := &cobra.Command{
		Use:   "update-targets <cluster>",
		Short: "update prometheus target configurations for a cluster",
		Long:  longHelp,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(runUpdateTargets),
	}
	initFlagInsecureForCmd(command)
	return command
}

// buildStopCmd returns the "stop" command, which calls roachprod.Stop for the
// named cluster with the signal, grace period and tag taken from the
// package-level flag variables. Waiting is enabled when --wait is set, and
// also when the signal is 9 and --wait was not given explicitly.
func (cr *commandRegistry) buildStopCmd() *cobra.Command {
	const longHelp = `Stop nodes on a cluster.

Stop roachprod created processes running on the nodes in a cluster, including
processes started by the "start", "run" and "ssh" commands. Every process
started by roachprod is tagged with a ROACHPROD environment variable which is
used by "stop" to locate the processes and terminate them. By default processes
are killed with signal 9 (SIGKILL) giving them no chance for a graceful exit.

The --sig flag will pass a signal to kill to allow us finer control over how we
shutdown cockroach. The --wait flag causes stop to loop waiting for all
processes with the right ROACHPROD environment variable to exit. Note that stop
will wait forever if you specify --wait with a non-terminating signal (e.g.
SIGHUP), unless you also configure --max-wait.

--wait defaults to true for signal 9 (SIGKILL) and false for all other signals.
` + tagHelp + `
`
	runStop := func(cmd *cobra.Command, args []string) error {
		// SIGKILL waits by default unless the user set --wait themselves.
		shouldWait := waitFlag || (sig == 9 /* SIGKILL */ && !cmd.Flags().Changed("wait"))
		return roachprod.Stop(context.Background(), config.Logger, args[0], roachprod.StopOpts{
			Wait:        shouldWait,
			GracePeriod: gracePeriod,
			ProcessTag:  tag,
			Sig:         sig,
		})
	}

	command := &cobra.Command{
		Use:   "stop <cluster> [--sig] [--wait]",
		Short: "stop nodes on a cluster",
		Long:  longHelp,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(runStop),
	}
	addHelpAboutNodes(command)
	initFlagsStopProcessForCmd(command, &sig, &waitFlag, &gracePeriod)
	initFlagInsecureIgnoreHostKeyForCmd(command)
	initFlagTagForCmd(command)
	return command
}

// buildStartInstanceCmd returns the "start-sql" command. Its single argument
// is the virtual cluster name, which is stored in the package-level startOpts
// along with the start target before roachprod.StartServiceForVirtualCluster
// is called against the package-level storageCluster.
func (cr *commandRegistry) buildStartInstanceCmd() *cobra.Command {
	const longHelp = `Start SQL/HTTP instances for a virtual cluster as separate processes.

The --storage-cluster flag must be used to specify a storage cluster
(with optional node selector) which is already running. The command
will create the virtual cluster on the storage cluster if it does not
exist already.  If creating multiple virtual clusters on the same
node, the --sql-instance flag must be passed to differentiate them.

The instance is started in shared process (in memory) mode by
default. To start an external process instance, pass the
--external-cluster flag indicating where the SQL server processes
should be started.

Nodes are started in secure mode by default and there is a one time
initialization for the cluster to create and distribute the certs.
Note that running some modes in secure mode and others in insecure
mode is not a supported Cockroach configuration. To start nodes in
insecure mode, use the --insecure flag.

The --binary flag specifies the remote binary to run, if starting
external services. It is up to the roachprod user to ensure this
binary exists, usually via "roachprod put". Note that no cockroach
software is installed by default on a newly created cluster.

The --args and --env flags can be used to pass arbitrary command line flags and
environment variables to the cockroach process.
` + tagHelp + `
`
	runStartSQL := func(_ *cobra.Command, args []string) error {
		var settings []install.ClusterSettingOption
		settings = append(settings,
			install.TagOption(tag),
			install.PGUrlCertsDirOption(pgurlCertsDir),
			install.SecureOption(isSecure),
			install.UseTreeDistOption(useTreeDist),
			install.EnvOption(nodeEnv),
			install.NumRacksOption(numRacks),
		)

		// An empty virtual cluster location selects shared-process mode; a
		// non-empty one (the `--external-nodes` option) selects a separate
		// process virtual cluster.
		if startOpts.VirtualClusterLocation == "" {
			startOpts.Target = install.StartSharedProcessForVirtualCluster
		} else {
			startOpts.Target = install.StartServiceForVirtualCluster
		}
		startOpts.VirtualClusterName = args[0]

		return roachprod.StartServiceForVirtualCluster(
			context.Background(), config.Logger, storageCluster, startOpts, settings...,
		)
	}

	command := &cobra.Command{
		Use:   "start-sql <name> --storage-cluster <storage-cluster> [--external-nodes <virtual-cluster-nodes>]",
		Short: "start the SQL/HTTP service for a virtual cluster as a separate process",
		Long:  longHelp,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(runStartSQL),
	}
	initStartInstanceCmdFlags(command)
	initFlagsStartOpsForCmd(command)
	initFlagTagForCmd(command)
	initFlagBinaryForCmd(command)
	initFlagInsecureForCmd(command)
	return command
}

// buildStopInstanceCmd builds the "stop-sql" command, which stops the SQL
// instances of a virtual cluster by delegating to
// roachprod.StopServiceForVirtualCluster.
func (cr *commandRegistry) buildStopInstanceCmd() *cobra.Command {
	// stopInstances turns the parsed flags into roachprod.StopOpts and issues
	// the stop request for the cluster named by the sole positional argument.
	stopInstances := func(cmd *cobra.Command, args []string) error {
		// SIGKILL waits for the processes to exit unless the user set --wait
		// explicitly on the command line.
		shouldWait := waitFlag
		if sig == 9 /* SIGKILL */ && !cmd.Flags().Changed("wait") {
			shouldWait = true
		}
		return roachprod.StopServiceForVirtualCluster(
			context.Background(), config.Logger, args[0], isSecure,
			roachprod.StopOpts{
				Wait:               shouldWait,
				GracePeriod:        gracePeriod,
				Sig:                sig,
				VirtualClusterName: virtualClusterName,
				SQLInstance:        sqlInstance,
			},
		)
	}

	c := &cobra.Command{
		Use:   "stop-sql <cluster> --cluster <name> --sql-instance <instance> [--sig] [--wait]",
		Short: "stop sql instances on a cluster",
		Long: `Stop sql instances on a cluster.

Stop roachprod created virtual clusters (shared or separate process). By default,
separate processes are killed with signal 9 (SIGKILL) giving them no chance for a
graceful exit.

The --sig flag will pass a signal to kill to allow us finer control over how we
shutdown processes. The --wait flag causes stop to loop waiting for all
processes to exit. Note that stop will wait forever if you specify --wait with a
non-terminating signal (e.g. SIGHUP), unless you also configure --max-wait.

--wait defaults to true for signal 9 (SIGKILL) and false for all other signals.
`,
		Args: cobra.ExactArgs(1),
		Run:  wrap(stopInstances),
	}
	initFlagsStopProcessForCmd(c, &sig, &waitFlag, &gracePeriod)
	initFlagInsecureForCmd(c)
	initFlagsClusterNSQLForCmd(c)
	return c
}

// buildDeployCmd builds the "deploy" command, which hands a rolling upgrade
// of cockroach off to roachprod.Deploy.
func (cr *commandRegistry) buildDeployCmd() *cobra.Command {
	// deploy interprets the optional third argument: for the "local"
	// application it is a mandatory path to a binary, for any other
	// application it is an optional version.
	deploy := func(cmd *cobra.Command, args []string) error {
		var versionArg, pathToBinary string
		isLocal := args[1] == "local"
		switch {
		case isLocal && len(args) < 3:
			return errors.New("local application requires a path to the binary: deploy <cluster> local <pathToBinary>")
		case isLocal:
			pathToBinary = args[2]
		case len(args) == 3:
			versionArg = args[2]
		}
		return roachprod.Deploy(context.Background(), config.Logger, args[0], args[1],
			versionArg, pathToBinary, pause, deploySig, deployWaitFlag, deployGracePeriod, secure)
	}

	c := &cobra.Command{
		Use:   "deploy <cluster> <application> <version>|<pathToBinary>",
		Short: "deploy a new version of cockroach",
		Long: fmt.Sprintf(`Performs a rolling upgrade of cockroach.

The deploy command currently only supports redeploying the storage cluster
(system tenant). It should be run on a cluster that is already running
cockroach. The command will download the specified version of cockroach and
stage it on the cluster. It will then perform a rolling upgrade of the cluster,
one node at a time, to the new version.

Currently available application options are:
  %s`, strings.TrimSpace(cockroachApp+releaseApp+customizedApp+localApp)),
		Args: cobra.RangeArgs(2, 3),
		Run:  wrap(deploy),
	}
	c.Flags().DurationVar(&pause, "pause", pause, "duration to pause between node restarts")
	initFlagsStopProcessForCmd(c, &deploySig, &deployWaitFlag, &deployGracePeriod)
	return c
}

// buildInitCmd builds the "init" command, which calls roachprod.Init for the
// named cluster using the shared startOpts.
func (cr *commandRegistry) buildInitCmd() *cobra.Command {
	runInit := func(cmd *cobra.Command, args []string) error {
		return roachprod.Init(context.Background(), config.Logger, args[0], startOpts)
	}

	c := &cobra.Command{
		Use:   "init <cluster>",
		Short: "initialize the cluster",
		Long: `Initialize the cluster.

The "init" command bootstraps the cluster (using "cockroach init"). It also sets
default cluster settings. It's intended to be used in conjunction with
'roachprod start --skip-init'.
`,
		Args: cobra.ExactArgs(1),
		Run:  wrap(runInit),
	}
	// --init-target writes straight into startOpts, which runInit passes on.
	c.Flags().IntVar(&startOpts.InitTarget,
		"init-target", startOpts.InitTarget, "node on which to run initialization")
	return c
}

// buildRunCmd builds the "run" command (alias "ssh"), which forwards the
// remaining arguments to roachprod.Run for the nodes of the named cluster.
func (cr *commandRegistry) buildRunCmd() *cobra.Command {
	// runOnNodes passes everything after the cluster name through as the
	// command, wiring the process's stdout and stderr to roachprod.Run.
	runOnNodes := func(_ *cobra.Command, args []string) error {
		clusterName, remoteCmd := args[0], args[1:]
		return roachprod.Run(context.Background(), config.Logger, clusterName, extraSSHOptions, tag,
			isSecure, os.Stdout, os.Stderr, remoteCmd, install.RunOptions{FailOption: install.FailSlow})
	}

	c := &cobra.Command{
		Use:     "run <cluster> <command> [args]",
		Aliases: []string{"ssh"},
		Short:   "run a command on the nodes in a cluster",
		Long: `Run a command on the nodes in a cluster.
`,
		Args: cobra.MinimumNArgs(1),
		Run:  wrap(runOnNodes),
	}
	addHelpAboutNodes(c)
	c.Flags().StringVarP(&extraSSHOptions,
		"ssh-options", "O", "", "extra args to pass to ssh")
	initFlagInsecureIgnoreHostKeyForCmd(c)
	initFlagTagForCmd(c)
	initFlagInsecureForCmd(c)
	return c
}

// buildSignalCmd builds the "signal" command, which parses an integer signal
// code from the second argument and passes it to roachprod.Signal.
func (cr *commandRegistry) buildSignalCmd() *cobra.Command {
	sendSignal := func(cmd *cobra.Command, args []string) error {
		// The code is parsed as a base-10 integer that must fit in 8 bits.
		code, err := strconv.ParseInt(args[1], 10, 8)
		if err != nil {
			return errors.Wrapf(err, "invalid signal argument")
		}
		return roachprod.Signal(context.Background(), config.Logger, args[0], int(code))
	}

	c := &cobra.Command{
		Use:   "signal <cluster> <signal>",
		Short: "send signal to cluster",
		Long:  "Send a POSIX signal, specified by its integer code, to every process started via roachprod in a cluster.",
		Args:  cobra.ExactArgs(2),
		Run:   wrap(sendSignal),
	}
	addHelpAboutNodes(c)
	return c
}

// buildWipeCmd builds the "wipe" command, which calls roachprod.Wipe for the
// named cluster, honouring the --preserve-certs flag.
func (cr *commandRegistry) buildWipeCmd() *cobra.Command {
	wipe := func(cmd *cobra.Command, args []string) error {
		return roachprod.Wipe(context.Background(), config.Logger, args[0], wipePreserveCerts)
	}

	c := &cobra.Command{
		Use:   "wipe <cluster>",
		Short: "wipe a cluster",
		Long: `Wipe the nodes in a cluster.

The "wipe" command first stops any processes running on the nodes in a cluster
(via the "stop" command) and then deletes the data directories used by the
nodes.
`,
		Args: cobra.ExactArgs(1),
		Run:  wrap(wipe),
	}
	addHelpAboutNodes(c)
	c.Flags().BoolVar(&wipePreserveCerts, "preserve-certs", false, "do not wipe certificates")
	initFlagInsecureIgnoreHostKeyForCmd(c)
	return c
}

// buildDestroyDNSCmd builds the "destroy-dns" command, which calls
// roachprod.DestroyDNS for the named cluster.
func (cr *commandRegistry) buildDestroyDNSCmd() *cobra.Command {
	destroyDNS := func(cmd *cobra.Command, args []string) error {
		return roachprod.DestroyDNS(context.Background(), config.Logger, args[0])
	}

	c := &cobra.Command{
		Use:   `destroy-dns <cluster>`,
		Short: `cleans up DNS entries for the cluster`,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(destroyDNS),
	}
	return c
}

// buildReformatCmd builds the "reformat" command, which calls
// roachprod.Reformat with the cluster name and the requested filesystem.
func (cr *commandRegistry) buildReformatCmd() *cobra.Command {
	reformatCmd := &cobra.Command{
		Use: "reformat <cluster> <filesystem>",
		// Short must stay a single line without a trailing newline: it is
		// rendered inline in the parent command's list of subcommands.
		Short: "reformat disks in a cluster",
		Long: `
Reformat disks in a cluster to use the specified filesystem.

WARNING: Reformatting will delete all existing data in the cluster.

Filesystem options:
  ext4
  zfs

When running with ZFS, you can create a snapshot of the filesystem's current
state using the 'zfs snapshot' command:

  $ roachprod run <cluster> 'sudo zfs snapshot data1@pristine'

You can then nearly instantaneously restore the filesystem to this state with
the 'zfs rollback' command:

  $ roachprod run <cluster> 'sudo zfs rollback data1@pristine'

`,
		Args: cobra.ExactArgs(2),
		Run: wrap(func(cmd *cobra.Command, args []string) error {
			// args[0] is the cluster, args[1] the target filesystem.
			return roachprod.Reformat(context.Background(), config.Logger, args[0], args[1])
		}),
	}
	initFlagInsecureIgnoreHostKeyForCmd(reformatCmd)
	return reformatCmd
}

// buildInstallCmd builds the "install" command, which passes the cluster name
// and the list of requested software to roachprod.Install. The long help
// enumerates the names returned by install.SortedCmds.
func (cr *commandRegistry) buildInstallCmd() *cobra.Command {
	installSoftware := func(cmd *cobra.Command, args []string) error {
		clusterName, software := args[0], args[1:]
		return roachprod.Install(context.Background(), config.Logger, clusterName, software)
	}

	c := &cobra.Command{
		Use:   "install <cluster> <software>",
		Short: "install 3rd party software",
		Long: `Install third party software. Currently available installation options are:

    ` + strings.Join(install.SortedCmds(), "\n    ") + `
`,
		Args: cobra.MinimumNArgs(2),
		Run:  wrap(installSoftware),
	}
	addHelpAboutNodes(c)
	initFlagInsecureIgnoreHostKeyForCmd(c)
	return c
}

// buildDistributeCertsCmd builds the "distribute-certs" command, which calls
// roachprod.DistributeCerts for the named cluster.
func (cr *commandRegistry) buildDistributeCertsCmd() *cobra.Command {
	distribute := func(cmd *cobra.Command, args []string) error {
		return roachprod.DistributeCerts(context.Background(), config.Logger, args[0])
	}

	c := &cobra.Command{
		Use:   "distribute-certs <cluster>",
		Short: "distribute certificates to the nodes in a cluster",
		Long: `Distribute certificates to the nodes in a cluster.
If the certificates already exist, no action is taken. Note that this command is
invoked automatically when a secure cluster is bootstrapped by "roachprod
start."
`,
		Args: cobra.ExactArgs(1),
		Run:  wrap(distribute),
	}
	return c
}

// buildSshKeysCmd builds the "ssh-keys" parent command and attaches its
// list, add and remove subcommands.
func (cr *commandRegistry) buildSshKeysCmd() *cobra.Command {
	parent := &cobra.Command{
		Use:   "ssh-keys",
		Short: "manage SSH public keys added to clusters created by roachprod",
	}
	subcommands := []*cobra.Command{
		buildSSHKeysListCmd(),
		buildSSHKeysAddCmd(),
		buildSSHKeysRemoveCmd(),
	}
	parent.AddCommand(subcommands...)
	return parent
}

// buildSSHKeysListCmd builds the "ssh-keys list" command, which fetches the
// authorized keys via gce.GetUserAuthorizedKeys and prints them as a table.
func buildSSHKeysListCmd() *cobra.Command {
	listKeys := func(cmd *cobra.Command, args []string) error {
		keys, err := gce.GetUserAuthorizedKeys()
		if err != nil {
			return err
		}
		return printPublicKeyTable(keys, true /* includeSize */)
	}

	c := &cobra.Command{
		Use:   "list",
		Short: "list every SSH public key installed on clusters managed by roachprod",
		Run:   wrap(listKeys),
	}
	return c
}

// buildSSHKeysAddCmd builds the "ssh-keys add" command, which reads a public
// key file, parses it as an authorized-key entry, and registers it for the
// user given by --user via gce.AddUserAuthorizedKey.
func buildSSHKeysAddCmd() *cobra.Command {
	addKey := func(cmd *cobra.Command, args []string) error {
		raw, err := os.ReadFile(args[0])
		if err != nil {
			return fmt.Errorf("error reading public key file: %w", err)
		}

		// Only the key and its comment are kept; options and any trailing
		// data returned by the parser are discarded.
		parsedKey, keyComment, _, _, err := ssh.ParseAuthorizedKey(raw)
		if err != nil {
			return fmt.Errorf("error parsing public key: %w", err)
		}

		entry := gce.AuthorizedKey{
			User:    sshKeyUser,
			Key:     parsedKey,
			Comment: keyComment,
		}
		fmt.Printf("Adding new public key for user %s...\n", entry.User)
		return gce.AddUserAuthorizedKey(entry)
	}

	c := &cobra.Command{
		Use:   "add <public-key-path> [--user user]",
		Short: "add a new SSH public key to the set of keys installed on clusters managed by roachprod",
		Args:  cobra.ExactArgs(1),
		Run:   wrap(addKey),
	}
	// --user defaults to the username recorded in config.OSUser.
	c.Flags().StringVar(&sshKeyUser, "user", config.OSUser.Username,
		"the user to be associated with the new key",
	)
	return c
}

// buildSSHKeysRemoveCmd builds the "ssh-keys remove" command, which removes
// every authorized key belonging to the given user after an interactive
// confirmation.
func buildSSHKeysRemoveCmd() *cobra.Command {
	removeKeys := func(cmd *cobra.Command, args []string) error {
		owner := args[0]

		allKeys, err := gce.GetUserAuthorizedKeys()
		if err != nil {
			return fmt.Errorf("failed to fetch existing keys: %w", err)
		}

		// Split the existing keys into those owned by the user (to delete)
		// and everything else (to keep).
		var doomed, kept gce.AuthorizedKeys
		for _, key := range allKeys {
			if key.User != owner {
				kept = append(kept, key)
				continue
			}
			doomed = append(doomed, key)
		}

		if len(doomed) == 0 {
			fmt.Printf("No keys deleted.\n")
			return nil
		}

		fmt.Printf("The following keys are going to be deleted:\n")
		if err := printPublicKeyTable(doomed, false /* includeSize */); err != nil {
			return err
		}

		if !PromptYesNo("Are you sure?", false /* defaultYes */) {
			fmt.Printf("Aborted.\n")
			return nil
		}
		fmt.Printf("Deleting %d keys belonging to %s...\n", len(doomed), owner)
		// Deletion is performed by writing back only the keys that are kept.
		return gce.SetUserAuthorizedKeys(kept)
	}

	c := &cobra.Command{
		Use:   "remove <user>",
		Short: "remove public keys belonging to a user from the set of keys installed on clusters managed by roachprod",
		Args:  cobra.ExactArgs(1),
		Run:   wrap(removeKeys),
	}
	return c
}

// buildPutCmd builds the "put" command, which copies a local file to the
// nodes of a cluster through roachprod.Put.
func (cr *commandRegistry) buildPutCmd() *cobra.Command {
	put := func(cmd *cobra.Command, args []string) error {
		// The destination defaults to the base name of the source unless a
		// third argument names it explicitly.
		source := args[1]
		target := path.Base(source)
		if len(args) == 3 {
			target = args[2]
		}
		return roachprod.Put(context.Background(), config.Logger, args[0], source, target, useTreeDist)
	}

	c := &cobra.Command{
		Use:   "put <cluster> <src> [<dest>]",
		Short: "copy a local file to the nodes in a cluster",
		Long: `Copy a local file to the nodes in a cluster.
`,
		Args: cobra.RangeArgs(2, 3),
		Run:  wrap(put),
	}
	addHelpAboutNodes(c)
	c.Flags().BoolVar(&useTreeDist, "treedist", useTreeDist, "use treedist copy algorithm")
	initFlagInsecureIgnoreHostKeyForCmd(c)
	initFlagSCPForCmd(c)
	return c
}

// buildGetCmd builds the "get" command, which copies a remote file from the
// nodes of a cluster through roachprod.Get.
func (cr *commandRegistry) buildGetCmd() *cobra.Command {
	get := func(cmd *cobra.Command, args []string) error {
		// The destination defaults to the base name of the source unless a
		// third argument names it explicitly.
		source := args[1]
		target := path.Base(source)
		if len(args) == 3 {
			target = args[2]
		}
		return roachprod.Get(context.Background(), config.Logger, args[0], source, target)
	}

	c := &cobra.Command{
		Use:   "get <cluster> <src> [<dest>]",
		Short: "copy a remote file from the nodes in a cluster",
		Long: `Copy a remote file from the nodes in a cluster. If the file is retrieved from
multiple nodes the destination file name will be prefixed with the node number.
`,
		Args: cobra.RangeArgs(2, 3),
		Run:  wrap(get),
	}
	addHelpAboutNodes(c)
	initFlagInsecureIgnoreHostKeyForCmd(c)
	initFlagSCPForCmd(c)
	return c
}

// buildStageCmd builds the "stage" command, which stages an application's
// binaries on a cluster through roachprod.Stage. The optional third argument
// is passed along as the version; it is empty when omitted.
func (cr *commandRegistry) buildStageCmd() *cobra.Command {
	stage := func(cmd *cobra.Command, args []string) error {
		var version string
		if len(args) == 3 {
			version = args[2]
		}
		clusterName, application := args[0], args[1]
		return roachprod.Stage(context.Background(), config.Logger, clusterName, stageOS, stageArch, stageDir, application, version)
	}

	c := &cobra.Command{
		Use:   "stage <cluster> <application> [<sha/version>]",
		Short: "stage cockroach binaries",
		Long: fmt.Sprintf(`Stages release and edge binaries to the cluster.

Currently available application options are:
  %s

Some examples of usage:
  -- stage edge build of cockroach build at a specific SHA:
  roachprod stage my-cluster cockroach e90e6903fee7dd0f88e20e345c2ddfe1af1e5a97

  -- Stage the most recent edge build of the workload tool:
  roachprod stage my-cluster workload

  -- Stage the official release binary of CockroachDB at version 2.0.5
  roachprod stage my-cluster release v2.0.5

  -- Stage customized binary of CockroachDB at version v23.2.0-alpha.2-4375-g7cd2b76ed00
  roachprod stage my-cluster customized v23.2.0-alpha.2-4375-g7cd2b76ed00
`, strings.TrimSpace(cockroachApp+workloadApp+releaseApp+customizedApp)),
		Args: cobra.RangeArgs(2, 3),
		Run:  wrap(stage),
	}
	initStageCmdFlags(c)
	return c
}

// buildDownloadCmd builds the "download" command, which forwards a URL, its
// sha256 and an optional destination to roachprod.Download.
func (cr *commandRegistry) buildDownloadCmd() *cobra.Command {
	download := func(cmd *cobra.Command, args []string) error {
		// The destination stays empty when the fourth argument is omitted.
		destination := ""
		if len(args) == 4 {
			destination = args[3]
		}
		return roachprod.Download(context.Background(), config.Logger, args[0], args[1], args[2], destination)
	}

	c := &cobra.Command{
		Use:   "download <cluster> <url> <sha256> [DESTINATION]",
		Short: "download 3rd party tools",
		Long:  "Downloads 3rd party tools, using a GCS cache if possible.",
		Args:  cobra.RangeArgs(3, 4),
		Run:   wrap(download),
	}
	return c
}

// buildStageURLCmd builds the "stageurl" command, which prints, one per
// line, the URLs returned by roachprod.StageURL for an application and an
// optional version.
func (cr *commandRegistry) buildStageURLCmd() *cobra.Command {
	printURLs := func(cmd *cobra.Command, args []string) error {
		// The version stays empty when the second argument is omitted.
		var version string
		if len(args) == 2 {
			version = args[1]
		}
		stageURLs, err := roachprod.StageURL(config.Logger, args[0], version, stageOS, stageArch)
		if err != nil {
			return err
		}
		for i := range stageURLs {
			fmt.Println(stageURLs[i])
		}
		return nil
	}

	c := &cobra.Command{
		Use:   "stageurl <application> [<sha/version>]",
		Short: "print URL to cockroach binaries",
		Long: `Prints URL for release and edge binaries.

Currently available application options are:
  cockroach  - Cockroach nightly builds. Can provide an optional SHA, otherwise
               latest build version is used.
  workload   - Cockroach workload application.
  release    - Official CockroachDB Release. Must provide a specific release
               version.
  customized - Cockroach customized builds, usually generated by running
               ./scripts/tag-custom-build.sh. Must provide a specific tag.
`,
		Args: cobra.RangeArgs(1, 2),
		Run:  wrap(printURLs),
	}
	initStageURLCmdFlags(c)
	return c
}

// buildSQLCmd builds the "sql" command, which runs `cockroach sql` against a
// cluster through roachprod.SQL. The --auth-mode value must be a key of
// install.PGAuthModes; anything else is rejected with an error that lists
// the valid modes.
func (cr *commandRegistry) buildSQLCmd() *cobra.Command {
	sqlCmd := &cobra.Command{
		Use:   "sql <cluster> -- [args]",
		Short: "run `cockroach sql` on a remote cluster",
		Long:  "Run `cockroach sql` on a remote cluster.\n",
		Args:  cobra.MinimumNArgs(1),
		Run: wrap(func(cmd *cobra.Command, args []string) error {
			auth, ok := install.PGAuthModes[authMode]
			if !ok {
				// Map iteration order is random; sort the valid modes so the
				// error message is identical from one run to the next.
				validModes := maps.Keys(install.PGAuthModes)
				sort.Slice(validModes, func(i, j int) bool { return validModes[i] < validModes[j] })
				return errors.Newf("unsupported auth-mode %s, valid auth-modes: %v", authMode, validModes)
			}

			// Everything after the cluster name is forwarded to roachprod.SQL.
			return roachprod.SQL(context.Background(), config.Logger, args[0], isSecure, virtualClusterName, sqlInstance, auth, database, args[1:])
		}),
	}
	addHelpAboutNodes(sqlCmd)
	initFlagAuthModeNDatabaseForCmd(sqlCmd)
	initFlagInsecureIgnoreHostKeyForCmd(sqlCmd)
	initFlagBinaryForCmd(sqlCmd)
	initFlagInsecureForCmd(sqlCmd)
	initFlagsClusterNSQLForCmd(sqlCmd)
	return sqlCmd
}

// buildIPCmd builds the "ip" command, which prints, one per line, the
// addresses returned by roachprod.IP for the named cluster. The --external
// flag is passed through to roachprod.IP.
func (cr *commandRegistry) buildIPCmd() *cobra.Command {
	printIPs := func(cmd *cobra.Command, args []string) error {
		addrs, err := roachprod.IP(config.Logger, args[0], external)
		if err != nil {
			return err
		}
		for i := range addrs {
			fmt.Println(addrs[i])
		}
		return nil
	}

	c := &cobra.Command{
		Use:   "ip <cluster>",
		Short: "get the IP addresses of the nodes in a cluster",
		Long: `Get the IP addresses of the nodes in a cluster.
`,
		Args: cobra.ExactArgs(1),
		Run:  wrap(printIPs),
	}
	c.Flags().BoolVar(&external,
		"external", false, "return external IP addresses")
	initFlagInsecureIgnoreHostKeyForCmd(c)
	return c
}

// buildPGUrlCmd builds the "pgurl" command, which resolves the requested
// auth mode and prints the URLs returned by roachprod.PgURL on a single
// space-separated line.
func (cr *commandRegistry) buildPGUrlCmd() *cobra.Command {
	printPGUrls := func(cmd *cobra.Command, args []string) error {
		auth, err := install.ResolveAuthMode(authMode)
		if err != nil {
			return err
		}
		opts := roachprod.PGURLOptions{
			Database:           database,
			External:           external,
			Secure:             isSecure,
			VirtualClusterName: virtualClusterName,
			SQLInstance:        sqlInstance,
			Auth:               auth,
		}
		pgurls, err := roachprod.PgURL(context.Background(), config.Logger, args[0], pgurlCertsDir, opts)
		if err != nil {
			return err
		}
		fmt.Println(strings.Join(pgurls, " "))
		return nil
	}

	c := &cobra.Command{
		Use:   "pgurl <cluster> --auth-mode <auth-mode>",
		Short: "generate pgurls for the nodes in a cluster",
		Long: fmt.Sprintf(`Generate pgurls for the nodes in a cluster.

%[1]s
`, strings.TrimSpace(AuthModeHelp)),
		Args: cobra.ExactArgs(1),
		Run:  wrap(printPGUrls),
	}
	addHelpAboutNodes(c)
	initFlagPgurlCertsDirForCmd(c)
	initFlagAuthModeNDatabaseForCmd(c)
	c.Flags().BoolVar(&external,
		"external", false, "return pgurls for external connections")
	initFlagInsecureIgnoreHostKeyForCmd(c)
	initFlagInsecureForCmd(c)
	initFlagsClusterNSQLForCmd(c)
	return c
}

// buildAdminurlCmd builds the "adminurl" command (aliases "admin" and
// "adminui"), which prints, one per line, the URLs returned by
// roachprod.AdminURL for the named cluster.
func (cr *commandRegistry) buildAdminurlCmd() *cobra.Command {
	adminurlCmd := &cobra.Command{
		Use:     "adminurl <cluster>",
		Aliases: []string{"admin", "adminui"},
		// Short must stay a single line without a trailing newline: it is
		// rendered inline in the parent command's list of subcommands.
		Short: "generate admin UI URLs for the nodes in a cluster",
		Long: `Generate admin UI URLs for the nodes in a cluster.
`,
		Args: cobra.ExactArgs(1),
		Run: wrap(func(cmd *cobra.Command, args []string) error {
			urls, err := roachprod.AdminURL(
				context.Background(), config.Logger, args[0], virtualClusterName, sqlInstance, adminurlPath, adminurlIPs, urlOpen, isSecure,
			)
			if err != nil {
				return err
			}
			// The loop variable is named u so it does not shadow the
			// imported net/url package.
			for _, u := range urls {
				fmt.Println(u)
			}
			return nil
		}),
	}
	addHelpAboutNodes(adminurlCmd)
	initAdminurlCmdFlags(adminurlCmd)
	initFlagOpenForCmd(adminurlCmd)
	initFlagInsecureIgnoreHostKeyForCmd(adminurlCmd)
	initFlagInsecureForCmd(adminurlCmd)
	initFlagsClusterNSQLForCmd(adminurlCmd)
	return adminurlCmd
}

// buildLogsCmd builds the "logs" command, which retrieves and merges logs
// from a cluster through roachprod.Logs. The first argument is the cluster;
// the optional second argument is the destination, which defaults to
// "<cluster>.logs".
func (cr *commandRegistry) buildLogsCmd() *cobra.Command {
	logsCmd := &cobra.Command{
		// The usage line spells out the positional arguments enforced by
		// Args below; the command name is still the first word, "logs".
		Use:   "logs <cluster> [<dest>]",
		Short: "retrieve and merge logs in a cluster",
		Long: `Retrieve and merge logs in a cluster.

The "logs" command runs until terminated. It works similarly to get but is
specifically focused on retrieving logs periodically and then merging them
into a single stream.
`,
		Args: cobra.RangeArgs(1, 2),
		Run: wrap(func(cmd *cobra.Command, args []string) error {
			logsOpts := roachprod.LogsOpts{
				Dir: logsDir, Filter: logsFilter, ProgramFilter: logsProgramFilter,
				Interval: logsInterval, From: logsFrom, To: logsTo, Out: cmd.OutOrStdout(),
			}
			// Derive the destination from the cluster name unless one was
			// given explicitly.
			dest := args[0] + ".logs"
			if len(args) == 2 {
				dest = args[1]
			}
			return roachprod.Logs(config.Logger, args[0], dest, logsOpts)
		}),
	}
	initLogsCmdFlags(logsCmd)
	return logsCmd
}

// buildPprofCmd builds the "pprof" command (alias "pprof-heap"), which
// captures a profile through roachprod.Pprof using the shared pprofOpts.
func (cr *commandRegistry) buildPprofCmd() *cobra.Command {
	capture := func(cmd *cobra.Command, args []string) error {
		// Invoking the command through its "pprof-heap" alias turns on the
		// Heap option.
		if cmd.CalledAs() == "pprof-heap" {
			pprofOpts.Heap = true
		}
		return roachprod.Pprof(context.Background(), config.Logger, args[0], pprofOpts)
	}

	c := &cobra.Command{
		Use:     "pprof <cluster>",
		Args:    cobra.ExactArgs(1),
		Aliases: []string{"pprof-heap"},
		Short:   "capture a pprof profile from the specified nodes",
		Long: `Capture a pprof profile from the specified nodes.

Examples:

    # Capture CPU profile for all nodes in the cluster
    roachprod pprof CLUSTERNAME
    # Capture CPU profile for the first node in the cluster for 60 seconds
    roachprod pprof CLUSTERNAME:1 --duration 60s
    # Capture a Heap profile for the first node in the cluster
    roachprod pprof CLUSTERNAME:1 --heap
    # Same as above
    roachprod pprof-heap CLUSTERNAME:1
`,
		Run: wrap(capture),
	}
	initPprofCmdFlags(c)
	return c
}

// buildCachedHostsCmd builds the "cached-hosts" command, which prints one
// line per cluster reported by roachprod.CachedClusters.
func (cr *commandRegistry) buildCachedHostsCmd() *cobra.Command {
	// printCluster writes a single output line for one cluster. Clusters
	// whose name starts with "teamcity" are skipped.
	printCluster := func(clusterName string, numVMs int) {
		if strings.HasPrefix(clusterName, "teamcity") {
			return
		}
		fmt.Printf("%s", clusterName)
		// Under bash-completion, cachedHostsCluster holds what the user has
		// typed so far; when it begins with this cluster's name, the
		// cluster's individual hosts are listed as well.
		if strings.HasPrefix(cachedHostsCluster, clusterName) {
			for node := 1; node <= numVMs; node++ {
				fmt.Printf(" %s:%d", clusterName, node)
			}
		}
		fmt.Printf("\n")
	}

	c := &cobra.Command{
		Use:   "cached-hosts",
		Short: "list all clusters (and optionally their host numbers) from local cache",
		Args:  cobra.NoArgs,
		Run: wrap(func(cmd *cobra.Command, args []string) error {
			roachprod.CachedClusters(printCluster)
			return nil
		}),
	}
	c.Flags().StringVar(&cachedHostsCluster,
		"cluster", "", "print hosts matching cluster")
	return c
}

// buildVersionCmd builds the "version" command, which prints the string
// returned by roachprod.Version.
func (cr *commandRegistry) buildVersionCmd() *cobra.Command {
	versionCmd := &cobra.Command{
		Use:   `version`,
		Short: `print version information`,
	}
	versionCmd.RunE = func(cmd *cobra.Command, args []string) error {
		version := roachprod.Version(config.Logger)
		fmt.Println(version)
		return nil
	}
	return versionCmd
}

// buildGetProvidersCmd builds the "get-providers" command, which prints each
// provider reported by roachprod.InitProviders together with its state, one
// per line, ordered by provider name.
func (cr *commandRegistry) buildGetProvidersCmd() *cobra.Command {
	return &cobra.Command{
		Use:   `get-providers`,
		Short: `print providers state (active/inactive)`,
		RunE: func(cmd *cobra.Command, args []string) error {
			providers := roachprod.InitProviders()
			// Map iteration order is random; sort the provider names so the
			// output is stable from one invocation to the next.
			names := maps.Keys(providers)
			sort.Slice(names, func(i, j int) bool { return names[i] < names[j] })
			for _, name := range names {
				fmt.Printf("%s: %s\n", name, providers[name])
			}
			return nil
		},
	}
}

// buildGrafanaStartCmd builds the "grafana-start" command, which resolves the
// dashboard configuration and target architecture from flags and then calls
// roachprod.StartGrafana.
func (cr *commandRegistry) buildGrafanaStartCmd() *cobra.Command {
	startGrafana := func(cmd *cobra.Command, args []string) error {
		var (
			dashboards []string
			configURL  string
		)
		if grafanaConfig == "" {
			// No explicit config: use the default dashboards.
			defaults, err := grafana.GetDefaultDashboardJSONs()
			if err != nil {
				return err
			}
			dashboards = defaults
		} else {
			parsed, err := url.Parse(grafanaConfig)
			if err != nil {
				return err
			}
			switch parsed.Scheme {
			case "http", "https":
				// Remote configs are handed over as a URL.
				configURL = grafanaConfig
			case "file", "":
				// Local configs are read here and passed along as JSON.
				data, err := grafana.GetDashboardJSONFromFile(parsed.Path)
				if err != nil {
					return err
				}
				dashboards = []string{data}
			default:
				return errors.Newf("unsupported scheme %s", parsed.Scheme)
			}
		}

		// Any value other than "arm64" selects amd64.
		arch := vm.ArchAMD64
		if grafanaArch == "arm64" {
			arch = vm.ArchARM64
		}
		return roachprod.StartGrafana(context.Background(), config.Logger, args[0], arch,
			configURL, dashboards, nil)
	}

	c := &cobra.Command{
		Use:   `grafana-start <cluster>`,
		Short: `spins up a prometheus and grafana instance on the last node in the cluster; NOTE: for arm64 clusters, use --arch arm64`,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(startGrafana),
	}
	initGrafanaStartCmdFlags(c)
	return c
}

// buildGrafanaDumpCmd builds the "grafana-dump" command, which calls
// roachprod.PrometheusSnapshot to dump prometheus data for the named cluster
// into the directory given by --dump-dir. The flag is mandatory; an empty
// value is rejected before any work is done.
func (cr *commandRegistry) buildGrafanaDumpCmd() *cobra.Command {
	grafanaDumpCmd := &cobra.Command{
		Use:   `grafana-dump <cluster>`,
		Short: `dump prometheus data to the specified directory`,
		Args:  cobra.ExactArgs(1),
		Run: wrap(func(cmd *cobra.Command, args []string) error {
			if grafanaDumpDir == "" {
				return errors.New("--dump-dir unspecified")
			}
			return roachprod.PrometheusSnapshot(context.Background(), config.Logger, args[0], grafanaDumpDir)
		}),
	}
	grafanaDumpCmd.Flags().StringVar(&grafanaDumpDir, "dump-dir", "",
		"the absolute path to dump prometheus data to (use the contained 'prometheus-docker-run.sh' to visualize)")
	return grafanaDumpCmd
}

// buildGrafanaStopCmd builds the "grafana-stop" command, which calls
// roachprod.StopGrafana for the named cluster with an empty final argument.
func (cr *commandRegistry) buildGrafanaStopCmd() *cobra.Command {
	stopGrafana := func(cmd *cobra.Command, args []string) error {
		return roachprod.StopGrafana(context.Background(), config.Logger, args[0], "")
	}

	c := &cobra.Command{
		Use:   `grafana-stop <cluster>`,
		Short: `spins down prometheus and grafana instances on the last node in the cluster`,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(stopGrafana),
	}
	return c
}

// buildGrafanaURLCmd builds the "grafanaurl" command, which prints the URL
// returned by roachprod.GrafanaURL for the named cluster.
func (cr *commandRegistry) buildGrafanaURLCmd() *cobra.Command {
	printURL := func(cmd *cobra.Command, args []string) error {
		dashboardURL, err := roachprod.GrafanaURL(context.Background(), config.Logger, args[0],
			urlOpen)
		if err != nil {
			return err
		}
		fmt.Println(dashboardURL)
		return nil
	}

	c := &cobra.Command{
		Use:   `grafanaurl <cluster>`,
		Short: `returns a url to the grafana dashboard`,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(printURL),
	}
	initFlagOpenForCmd(c)
	return c
}

// buildGrafanaAnnotationCmd builds the "grafana-annotation" command, which
// assembles a grafana.AddAnnotationRequest from the arguments and flags and
// submits it through roachprod.AddGrafanaAnnotation.
func (cr *commandRegistry) buildGrafanaAnnotationCmd() *cobra.Command {
	annotate := func(cmd *cobra.Command, args []string) error {
		host, text := args[0], args[1]
		req := grafana.AddAnnotationRequest{
			Text:         text,
			Tags:         grafanaTags,
			DashboardUID: grafanaDashboardUID,
		}

		// --time-range accepts zero, one (start only) or two (start and end)
		// values. With none, the times are left unset and the Grafana API
		// will default to adding the annotation at the current time.
		numTimes := len(grafanaTimeRange)
		if numTimes > 2 {
			return errors.Newf("Too many arguments for --time-range, expected 1 or 2, got: %d", numTimes)
		}
		if numTimes >= 1 {
			req.StartTime = grafanaTimeRange[0]
		}
		if numTimes == 2 {
			req.EndTime = grafanaTimeRange[1]
		}

		return roachprod.AddGrafanaAnnotation(context.Background(), host, isSecure, req)
	}

	c := &cobra.Command{
		Use:   `grafana-annotation <host> <text> --tags [<tag1>, ...] --dashboard-uid <dashboard-uid> --time-range [<start-time>, <end-time>]`,
		Short: `adds an annotation to the specified grafana instance`,
		Long: fmt.Sprintf(`Adds an annotation to the specified grafana instance

By default, we assume the grafana instance needs an authentication token to connect
to. A service account json and audience will be read in from the environment
variables %s and %s to attempt authentication through google IDP. Use the --insecure
option when a token is not necessary.

--tags specifies the tags the annotation should have.

--dashboard-uid specifies the dashboard you want the annotation to be created in. If
left empty, creates the annotation in the organization instead.

--time-range can be used to specify in epoch millisecond time the annotation's timestamp.
If left empty, creates the annotation at the current time. If only start-time is specified,
creates an annotation at start-time. If both start-time and end-time are specified,
creates an annotation over time range.

Example:
# Create an annotation over time range 1-100 on the centralized grafana instance, which needs authentication.
roachprod grafana-annotation grafana.testeng.crdb.io example-annotation-event --tags my-cluster --tags test-run-1 --dashboard-uid overview --time-range 1,100
`, roachprodutil.ServiceAccountJson, roachprodutil.ServiceAccountAudience),
		Args: cobra.ExactArgs(2),
		Run:  wrap(annotate),
	}
	initGrafanaAnnotationCmdFlags(c)
	initFlagInsecureForCmd(c)
	return c
}

// buildRootStorageCmd builds the "storage" parent command. It carries a
// single "collection" subcommand, under which the start, stop, snapshot and
// list-volumes commands are registered.
func (cr *commandRegistry) buildRootStorageCmd() *cobra.Command {
	rootStorageCmd := &cobra.Command{
		Use:   `storage`,
		Short: "storage enables administering storage related commands and configurations",
		Args:  cobra.MinimumNArgs(1),
	}
	rootStorageCollectionCmd := &cobra.Command{
		Use: `collection`,
		Short: "the collection command allows for enabling or disabling the storage workload " +
			"collector for a provided cluster (including a subset of nodes). The storage workload " +
			"collection is defined in pebble replay/workload_capture.go.",
		Args: cobra.MinimumNArgs(1),
	}
	rootStorageCmd.AddCommand(rootStorageCollectionCmd)
	rootStorageCollectionCmd.AddCommand(
		buildCollectionStartCmd(),
		buildCollectionStopCmd(),
		buildStorageSnapshotCmd(),
		buildCollectionListVolumes(),
	)
	return rootStorageCmd
}

// buildCollectionStartCmd builds the "start" subcommand of "storage
// collection", which issues the "start" action through
// roachprod.StorageCollectionPerformAction with the shared volumeCreateOpts.
func buildCollectionStartCmd() *cobra.Command {
	start := func(cmd *cobra.Command, args []string) error {
		return roachprod.StorageCollectionPerformAction(
			context.Background(), config.Logger, args[0], "start", volumeCreateOpts,
		)
	}

	c := &cobra.Command{
		Use:   `start <cluster>`,
		Short: "start the workload collector for a provided cluster (including a subset of nodes)",
		Args:  cobra.ExactArgs(1),
		Run:   wrap(start),
	}
	initCollectionStartCmdFlags(c)
	return c
}

// buildCollectionStopCmd builds the "stop" subcommand of "storage
// collection", which issues the "stop" action through
// roachprod.StorageCollectionPerformAction with the shared volumeCreateOpts.
func buildCollectionStopCmd() *cobra.Command {
	stop := func(cmd *cobra.Command, args []string) error {
		return roachprod.StorageCollectionPerformAction(
			context.Background(), config.Logger, args[0], "stop", volumeCreateOpts,
		)
	}

	c := &cobra.Command{
		Use:   `stop <cluster>`,
		Short: "stop the workload collector for a provided cluster (including a subset of nodes)",
		Args:  cobra.ExactArgs(1),
		Run:   wrap(stop),
	}
	return c
}

// buildCollectionListVolumes builds the "list-volumes" subcommand of "storage
// collection", which issues the "list-volumes" action through
// roachprod.StorageCollectionPerformAction with the shared volumeCreateOpts.
func buildCollectionListVolumes() *cobra.Command {
	listVolumes := func(cmd *cobra.Command, args []string) error {
		return roachprod.StorageCollectionPerformAction(
			context.Background(), config.Logger, args[0], "list-volumes", volumeCreateOpts,
		)
	}

	c := &cobra.Command{
		Use:   `list-volumes <cluster>`,
		Short: "list the nodes and their attached collector volumes",
		Args:  cobra.ExactArgs(1),
		Run:   wrap(listVolumes),
	}
	return c
}

// buildStorageSnapshotCmd builds the "snapshot" subcommand of "storage
// collection", which calls roachprod.CreateSnapshot with the given name and
// description and reports only the resulting error.
func buildStorageSnapshotCmd() *cobra.Command {
	snapshot := func(cmd *cobra.Command, args []string) error {
		opts := vm.VolumeSnapshotCreateOpts{
			Name:        args[1],
			Description: args[2],
		}
		// The created snapshots themselves are not used here.
		_, err := roachprod.CreateSnapshot(context.Background(), config.Logger, args[0], opts)
		return err
	}

	c := &cobra.Command{
		Use:   `snapshot <cluster> <name> <description>`,
		Short: "snapshot a clusters workload collector volume",
		Args:  cobra.ExactArgs(3),
		Run:   wrap(snapshot),
	}
	return c
}

// buildSnapshotCmd builds the "snapshot" parent command and attaches its
// create, list, delete and apply subcommands.
func (cr *commandRegistry) buildSnapshotCmd() *cobra.Command {
	parent := &cobra.Command{
		Use:   `snapshot`,
		Short: "snapshot enables creating/listing/deleting/applying cluster snapshots",
		Args:  cobra.MinimumNArgs(1),
	}
	subcommands := []*cobra.Command{
		buildSnapshotCreateCmd(),
		buildSnapshotListCmd(),
		buildSnapshotDeleteCmd(),
		buildSnapshotApplyCmd(),
	}
	parent.AddCommand(subcommands...)
	return parent
}

// buildSnapshotCreateCmd returns the `create <cluster> <name> <description>`
// command. It requires exactly three arguments, calls roachprod.CreateSnapshot
// with the given name and description, and logs the name and ID of every
// snapshot returned.
func buildSnapshotCreateCmd() *cobra.Command {
	createSnapshots := func(cmd *cobra.Command, args []string) error {
		clusterName, snapName, snapDesc := args[0], args[1], args[2]
		opts := vm.VolumeSnapshotCreateOpts{
			Name:        snapName,
			Description: snapDesc,
		}
		created, err := roachprod.CreateSnapshot(context.Background(), config.Logger, clusterName, opts)
		if err != nil {
			return err
		}
		for _, s := range created {
			config.Logger.Printf("created snapshot %s (id: %s)", s.Name, s.ID)
		}
		return nil
	}
	return &cobra.Command{
		Use:   `create <cluster> <name> <description>`,
		Short: "snapshot a named cluster, using the given snapshot name and description",
		Args:  cobra.ExactArgs(3),
		Run:   wrap(createSnapshots),
	}
}

// buildSnapshotListCmd returns the `list <provider> [<name>]` command. It
// accepts one or two arguments: the cloud provider and an optional name that
// is passed to roachprod.ListSnapshots as the NamePrefix filter (empty when
// omitted). The name and ID of every snapshot returned are logged.
func buildSnapshotListCmd() *cobra.Command {
	listSnapshots := func(cmd *cobra.Command, args []string) error {
		providerName := args[0]
		// The name filter is optional; an empty prefix is used when it is absent.
		prefix := ""
		if len(args) == 2 {
			prefix = args[1]
		}
		filter := vm.VolumeSnapshotListOpts{
			NamePrefix: prefix,
		}
		found, err := roachprod.ListSnapshots(context.Background(), config.Logger, providerName, filter)
		if err != nil {
			return err
		}
		for _, s := range found {
			config.Logger.Printf("found snapshot %s (id: %s)", s.Name, s.ID)
		}
		return nil
	}
	return &cobra.Command{
		Use:   `list <provider> [<name>]`,
		Short: "list all snapshots for the given cloud provider, optionally filtering by the given name",
		Args:  cobra.RangeArgs(1, 2),
		Run:   wrap(listSnapshots),
	}
}

// buildSnapshotDeleteCmd returns the `delete <provider> <name>` command. Both
// arguments are required: the snapshots to delete are looked up with
// roachprod.ListSnapshots using <name> as the NamePrefix filter, each match is
// logged, and the matches are then passed to roachprod.DeleteSnapshots.
//
// The command registers a --dry-run flag bound to the dryrun variable; when it
// is set, the matching snapshots are still listed and logged but
// roachprod.DeleteSnapshots is not called.
func buildSnapshotDeleteCmd() *cobra.Command {
	snapshotDeleteCmd := &cobra.Command{
		Use: `delete <provider> <name>`,
		// The name argument is mandatory (see Args below), so the help text
		// must not describe the filter as optional.
		Short: "delete all snapshots for the given cloud provider, filtering by the given name",
		Args:  cobra.ExactArgs(2),
		Run: wrap(func(cmd *cobra.Command, args []string) error {
			ctx := context.Background()
			provider, name := args[0], args[1]
			snapshots, err := roachprod.ListSnapshots(ctx, config.Logger, provider,
				vm.VolumeSnapshotListOpts{
					NamePrefix: name,
				},
			)
			if err != nil {
				return err
			}

			// Log every match up front so that a dry run shows what would be
			// removed.
			for _, snapshot := range snapshots {
				config.Logger.Printf("deleting snapshot %s (id: %s)", snapshot.Name, snapshot.ID)
			}
			if !dryrun {
				if err := roachprod.DeleteSnapshots(ctx, config.Logger, provider, snapshots...); err != nil {
					return err
				}
			}
			config.Logger.Printf("done")
			return nil
		}),
	}
	snapshotDeleteCmd.Flags().BoolVar(&dryrun,
		"dry-run", false, "dry run (don't perform any actions)")
	return snapshotDeleteCmd
}

// buildSnapshotApplyCmd returns the `apply <provider> <name> <cluster>`
// command. It requires exactly three arguments, looks up snapshots with
// roachprod.ListSnapshots using <name> as the NamePrefix filter, and hands the
// result to roachprod.ApplySnapshots for the given cluster together with
// fixed volume-creation options (Size 500 and a vm.TagUsage label).
func buildSnapshotApplyCmd() *cobra.Command {
	applySnapshots := func(cmd *cobra.Command, args []string) error {
		ctx := context.Background()
		providerName, prefix, clusterName := args[0], args[1], args[2]

		filter := vm.VolumeSnapshotListOpts{
			NamePrefix: prefix,
		}
		matching, err := roachprod.ListSnapshots(ctx, config.Logger, providerName, filter)
		if err != nil {
			return err
		}

		volumeOpts := vm.VolumeCreateOpts{
			Size: 500, // TODO(irfansharif): Make this configurable?
			Labels: map[string]string{
				vm.TagUsage: "roachprod",
			},
		}
		return roachprod.ApplySnapshots(ctx, config.Logger, clusterName, matching, volumeOpts)
	}
	return &cobra.Command{
		Use:   `apply <provider> <name> <cluster> `,
		Short: "apply the named snapshots from the given cloud provider to the named cluster",
		Args:  cobra.ExactArgs(3),
		Run:   wrap(applySnapshots),
	}
}

// buildUpdateCmd returns the `update` command and registers its flags via
// initUpdateCmdFlags.
//
// When run, the command refuses to proceed on any OS other than Linux. With
// roachprodUpdateRevert set it asks for confirmation and then calls SwapBinary
// with the current executable and its ".bak" sibling. Otherwise it calls
// DownloadLatestRoachprod targeting "<current executable>.new", asks for
// confirmation, and calls SwapBinary with the current executable and the
// downloaded file. Declining either prompt returns nil without swapping.
func (cr *commandRegistry) buildUpdateCmd() *cobra.Command {
	runUpdate := func(cmd *cobra.Command, args []string) error {
		// We only have prebuilt binaries for Linux. See #120750.
		if runtime.GOOS != "linux" {
			return errors.New("this command is only available on Linux at this time")
		}

		exePath, err := os.Executable()
		if err != nil {
			return err
		}

		if roachprodUpdateRevert {
			const revertPrompt = "Revert to previous version? Note: this will replace the" +
				" current roachprod binary with a previous roachprod.bak binary."
			if !PromptYesNo(revertPrompt, true /* defaultYes */) {
				return nil
			}
			if err := SwapBinary(exePath, exePath+".bak"); err != nil {
				return err
			}
			fmt.Println("roachprod successfully reverted, run `roachprod -v` to confirm.")
			return nil
		}

		// The download always happens before the confirmation prompt.
		downloaded := exePath + ".new"
		err = DownloadLatestRoachprod(downloaded, roachprodUpdateBranch, roachprodUpdateOS, roachprodUpdateArch)
		if err != nil {
			return err
		}

		const updatePrompt = "Continue with update? This will overwrite any existing roachprod.bak binary."
		if !PromptYesNo(updatePrompt, true /* defaultYes */) {
			return nil
		}
		if err := SwapBinary(exePath, downloaded); err != nil {
			return errors.WithDetail(err, "unable to update binary")
		}

		fmt.Println("Update successful: run `roachprod -v` to confirm.")
		return nil
	}

	updateCmd := &cobra.Command{
		Use:   "update",
		Short: "check gs://cockroach-nightly for a new roachprod binary; update if available",
		Long: "Attempts to download the latest roachprod binary (on master) from gs://cockroach-nightly. " +
			" Swaps the current binary with it. The current roachprod binary will be backed up" +
			" and can be restored via `roachprod update --revert`.",
		Run: wrap(runUpdate),
	}
	initUpdateCmdFlags(updateCmd)
	return updateCmd
}

// buildJaegerStartCmd returns the `jaeger-start <cluster>` command. It takes
// exactly one argument (the cluster) and delegates to roachprod.StartJaeger,
// passing virtualClusterName, isSecure and jaegerConfigNodes. It registers the
// --configure-nodes flag (bound to jaegerConfigNodes) and then applies
// initFlagInsecureForCmd and initFlagsClusterNSQLForCmd to the command.
func (cr *commandRegistry) buildJaegerStartCmd() *cobra.Command {
	startJaeger := func(cmd *cobra.Command, args []string) error {
		clusterName := args[0]
		return roachprod.StartJaeger(
			context.Background(), config.Logger, clusterName,
			virtualClusterName, isSecure, jaegerConfigNodes,
		)
	}

	jaegerCmd := &cobra.Command{
		Use:   `jaeger-start <cluster>`,
		Short: `starts a jaeger container on the last node in the cluster`,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(startJaeger),
	}

	flags := jaegerCmd.Flags()
	flags.StringVar(&jaegerConfigNodes, "configure-nodes", "",
		"the nodes on which to set the relevant CRDB cluster settings")
	initFlagInsecureForCmd(jaegerCmd)
	initFlagsClusterNSQLForCmd(jaegerCmd)
	return jaegerCmd
}

// buildJaegerStopCmd returns the `jaeger-stop <cluster>` command, which takes
// exactly one argument (the cluster) and delegates to roachprod.StopJaeger.
func (cr *commandRegistry) buildJaegerStopCmd() *cobra.Command {
	stopJaeger := func(cmd *cobra.Command, args []string) error {
		clusterName := args[0]
		return roachprod.StopJaeger(context.Background(), config.Logger, clusterName)
	}
	return &cobra.Command{
		Use:   `jaeger-stop <cluster>`,
		Short: `stops a running jaeger container on the last node in the cluster`,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(stopJaeger),
	}
}

// buildJaegerURLCmd returns the `jaegerurl <cluster>` command. It takes exactly
// one argument (the cluster), calls roachprod.JaegerURL with the urlOpen
// setting, and prints the returned URL to stdout. The command's flags are set
// up by initFlagOpenForCmd.
func (cr *commandRegistry) buildJaegerURLCmd() *cobra.Command {
	printJaegerURL := func(cmd *cobra.Command, args []string) error {
		clusterName := args[0]
		jaegerURL, err := roachprod.JaegerURL(context.Background(), config.Logger, clusterName, urlOpen)
		if err != nil {
			return err
		}
		fmt.Println(jaegerURL)
		return nil
	}

	urlCmd := &cobra.Command{
		Use:   `jaegerurl <cluster>`,
		Short: `returns the URL of the cluster's jaeger UI`,
		Args:  cobra.ExactArgs(1),
		Run:   wrap(printJaegerURL),
	}
	initFlagOpenForCmd(urlCmd)
	return urlCmd
}

// buildSideEyeRootCmd returns the parent `side-eye` command and registers the
// install and snapshot subcommands on it. The parent has no Run function of
// its own and requires at least one argument.
func (cr *commandRegistry) buildSideEyeRootCmd() *cobra.Command {
	root := &cobra.Command{
		Use:   "side-eye",
		Short: "interact with side-eye.io functionality",
		Long: `Interact with side-eye.io functionality

Side-Eye (app.side-eye.io) is a distributed debugger that can be used to capture
snapshots of a CockroachDB cluster.
`,
		Args: cobra.MinimumNArgs(1),
	}
	root.AddCommand(
		buildSideEyeInstallCmd(),
		buildSideEyeSnapCmd(),
	)
	return root
}

// buildSideEyeInstallCmd returns the `install <cluster>` command. It takes
// exactly one argument (the cluster), obtains a token from
// roachprod.GetSideEyeTokenFromEnv, and fails if none is available. Otherwise
// it delegates to roachprod.StartSideEyeAgents, using the cluster name as the
// environment name as well.
func buildSideEyeInstallCmd() *cobra.Command {
	installAgents := func(cmd *cobra.Command, args []string) error {
		token, found := roachprod.GetSideEyeTokenFromEnv()
		if !found {
			return errors.New("Side-Eye token is not configured via SIDE_EYE_API_TOKEN or gcloud secret")
		}

		clusterName := args[0]
		return roachprod.StartSideEyeAgents(
			context.Background(), config.Logger, clusterName, clusterName /* envName */, token,
		)
	}
	return &cobra.Command{
		Use:   "install <cluster>",
		Short: "install and start the Side-Eye agents on all nodes in the cluster",
		Long: `Install and start the Side-Eye agents on all nodes in the cluster

` + "`roachprod side-eye snapshot <cluster>`" + ` can then be used to capture cluster snapshots.
`,
		Args: cobra.ExactArgs(1),
		Run:  wrap(installAgents),
	}
}

// buildSideEyeSnapCmd returns the `snapshot <cluster/Side-Eye environment>`
// command (alias `snap`). It takes exactly one argument, calls
// roachprod.CaptureSideEyeSnapshot with a nil client, and logs the snapshot
// URL when the capture reports success. The command itself always returns nil;
// an unsuccessful capture is not turned into an error here.
func buildSideEyeSnapCmd() *cobra.Command {
	captureSnapshot := func(cmd *cobra.Command, args []string) error {
		ctx, l := context.Background(), config.Logger
		envName := args[0]

		l.PrintfCtx(ctx, "capturing snapshot of the cluster with Side-Eye...")
		snapURL, ok := roachprod.CaptureSideEyeSnapshot(ctx, l, envName, nil /* client */)
		if !ok {
			return nil
		}
		l.PrintfCtx(ctx, "captured Side-Eye snapshot: %s", snapURL)
		return nil
	}
	return &cobra.Command{
		Use:     "snapshot <cluster/Side-Eye environment>",
		Aliases: []string{"snap"},
		Short:   "capture a cluster snapshot",
		Long: `Capture a cluster snapshot using Side-Eye

The command will print an app.side-eye.io URL where the snapshot can be viewed.
`,
		Args: cobra.ExactArgs(1),
		Run:  wrap(captureSnapshot),
	}
}

// buildFluentBitStartCmd returns the `fluent-bit-start <cluster>` command. It
// takes exactly one argument (the cluster) and delegates to
// roachprod.StartFluentBit with the fluentBitConfig value. The command's flags
// are set up by initFluentBitStartCmdFlags.
func (cr *commandRegistry) buildFluentBitStartCmd() *cobra.Command {
	startFluentBit := func(cmd *cobra.Command, args []string) error {
		clusterName := args[0]
		return roachprod.StartFluentBit(context.Background(), config.Logger, clusterName, fluentBitConfig)
	}

	startCmd := &cobra.Command{
		Use:   "fluent-bit-start <cluster>",
		Short: "Install and start Fluent Bit",
		Long:  "Install and start Fluent Bit",
		Args:  cobra.ExactArgs(1),
		Run:   wrap(startFluentBit),
	}
	initFluentBitStartCmdFlags(startCmd)
	return startCmd
}

// buildFluentBitStopCmd returns the `fluent-bit-stop <cluster>` command, which
// takes exactly one argument (the cluster) and delegates to
// roachprod.StopFluentBit.
func (cr *commandRegistry) buildFluentBitStopCmd() *cobra.Command {
	stopFluentBit := func(cmd *cobra.Command, args []string) error {
		clusterName := args[0]
		return roachprod.StopFluentBit(context.Background(), config.Logger, clusterName)
	}
	return &cobra.Command{
		Use:   "fluent-bit-stop <cluster>",
		Short: "Stop Fluent Bit",
		Long:  "Stop Fluent Bit",
		Args:  cobra.ExactArgs(1),
		Run:   wrap(stopFluentBit),
	}
}

// buildOpentelemetryStartCmd returns the `opentelemetry-start <cluster>`
// command. It takes exactly one argument (the cluster) and delegates to
// roachprod.StartOpenTelemetry with the opentelemetryConfig value. The
// command's flags are set up by initOpentelemetryStartCmdFlags.
func (cr *commandRegistry) buildOpentelemetryStartCmd() *cobra.Command {
	startCollector := func(cmd *cobra.Command, args []string) error {
		clusterName := args[0]
		return roachprod.StartOpenTelemetry(context.Background(), config.Logger, clusterName, opentelemetryConfig)
	}

	startCmd := &cobra.Command{
		Use:   "opentelemetry-start <cluster>",
		Short: "Install and start the OpenTelemetry Collector",
		Long:  "Install and start the OpenTelemetry Collector",
		Args:  cobra.ExactArgs(1),
		Run:   wrap(startCollector),
	}
	initOpentelemetryStartCmdFlags(startCmd)
	return startCmd
}

// buildOpentelemetryStopCmd returns the `opentelemetry-stop <cluster>` command,
// which takes exactly one argument (the cluster) and delegates to
// roachprod.StopOpenTelemetry.
func (cr *commandRegistry) buildOpentelemetryStopCmd() *cobra.Command {
	stopCollector := func(cmd *cobra.Command, args []string) error {
		clusterName := args[0]
		return roachprod.StopOpenTelemetry(context.Background(), config.Logger, clusterName)
	}
	return &cobra.Command{
		Use:   "opentelemetry-stop <cluster>",
		Short: "Stop the OpenTelemetry Collector",
		Long:  "Stop the OpenTelemetry Collector",
		Args:  cobra.ExactArgs(1),
		Run:   wrap(stopCollector),
	}
}

// buildFetchLogsCmd returns the `fetchlogs` command (alias `getlogs`). It
// accepts the cluster and an optional destination directory. When no
// destination is given, the directory name is derived from the cluster
// argument (the part before the first ':' followed by "_logs") and announced
// on stdout. The destination is created with os.Mkdir, so an error from that
// call aborts the command before roachprod.FetchLogs is invoked.
//
// The command registers a --timeout/-t flag bound to fetchLogsTimeout with a
// default of five minutes.
func (cr *commandRegistry) buildFetchLogsCmd() *cobra.Command {
	fetchLogs := func(cmd *cobra.Command, args []string) error {
		clusterArg := args[0]

		var destDir string
		switch len(args) {
		case 2:
			destDir = args[1]
		default:
			// Keep only the part of the cluster argument that precedes the
			// first ':' (the node selector, if any) as the directory prefix.
			clusterName, _, _ := strings.Cut(clusterArg, ":")
			destDir = clusterName + "_logs"
			fmt.Printf("Placing logs at %s\n", destDir)
		}

		if err := os.Mkdir(destDir, 0755); err != nil {
			return err
		}
		return roachprod.FetchLogs(context.Background(), config.Logger, clusterArg, destDir, fetchLogsTimeout)
	}

	logsCmd := &cobra.Command{
		Use:     "fetchlogs <cluster> <destination (optional)> [flags]",
		Aliases: []string{"getlogs"},
		Short:   "download the logs from the cluster",
		Long: `Download the logs from the cluster using "roachprod get".

The logs will be placed in the directory if specified or in the directory named as <clustername>_logs.
`,
		Args: cobra.RangeArgs(1, 2),
		Run:  wrap(fetchLogs),
	}
	logsCmd.Flags().DurationVarP(
		&fetchLogsTimeout, "timeout", "t", 5*time.Minute,
		"Timeout for fetching the logs from the cluster nodes",
	)
	return logsCmd
}

// buildGetLatestPProfCmd returns the `get-latest-pprof <cluster> [time-before]`
// command. It accepts the cluster plus an optional time-before argument and
// forwards both to roachprod.DownloadLatestPProfFile; an empty time-before
// string is passed when the argument is omitted.
func (cr *commandRegistry) buildGetLatestPProfCmd() *cobra.Command {
	return &cobra.Command{
		Use:   "get-latest-pprof <cluster> [time-before]",
		Short: "downloads the latest pprof file which is created on or before the provided time-before.",
		Long: `Downloads the latest pprof file which is created on or before the provided time-before.
The time should be of the format 2022-08-31T15:23:22Z for UTC or 2022-08-31T15:23:22+05:30 for time zone.
If the time is not provided, it downloads the latest pprof file across all clusters.
`,
		// Accept exactly one or two arguments, matching Use. A minimum-only
		// check would let extra arguments through, and since time-before is
		// read only when there are exactly two arguments, a supplied
		// time-before would then be silently ignored.
		Args: cobra.RangeArgs(1, 2),
		// Wraps the command execution with additional error handling
		Run: wrap(func(cmd *cobra.Command, args []string) error {
			cluster := args[0]
			pprofTimeBefore := ""
			if len(args) == 2 {
				// time-before is optional
				pprofTimeBefore = args[1]
			}
			ctx := context.Background()
			return roachprod.DownloadLatestPProfFile(ctx, config.Logger, cluster, pprofTimeBefore)
		}),
	}
}
