#!/usr/bin/env python3

__author__ = "Ben Woodcroft"
__copyright__ = "Copyright 2021"
__credits__ = ["Ben Woodcroft"]
__license__ = "GPL3+"
__maintainer__ = "Ben Woodcroft"
__email__ = "b.woodcroft near qut.edu.au"
__status__ = "Development"

import os
import sys
import logging
import re

from bird_tool_utils import *
from bird_tool_utils.people import *

# Put the parent of this script's directory (symlinks resolved via realpath)
# at the front of the module search path before importing kingfisher.
# NOTE(review): presumably so the kingfisher package that sits alongside this
# script is preferred over any other installed copy - confirm.
sys.path = [os.path.join(os.path.dirname(os.path.realpath(__file__)),'..')] + sys.path
import kingfisher

def fix(help_string):
    """Collapse each run of consecutive spaces in help_string to one space.

    Used to tidy help text written as backslash-continued string literals,
    where the indentation of the continuation lines ends up inside the
    string. Only the space character is affected; newlines and tabs are
    left untouched.
    """
    return re.sub('  *', ' ', help_string)

def add_extraction_args(parser):
    """Register the extraction-related options on parser.

    Adds -f/--output-format-possibilities, --force, --unsorted and --stdout
    (in that order) to the given parser or argument group, then returns the
    same object so that calls can be chained.
    """
    default_formats = kingfisher.DEFAULT_OUTPUT_FORMAT_POSSIBILITIES
    formats_help = 'Allowable output formats. If more than one is specified, downloaded data will processed as little as possible [default: "{}"]'.format(
        ' '.join(default_formats))
    # Possible to specify --unsorted with -m ena-ftp aws-http, for example, which means that the order will be unknown.
    unsorted_help = fix('Output the sequences in arbitrary order, usually the order that they appear in the .sra file. \
            Even pairs of reads may be in the usual order, but it is possible to tell which pair \
            is which, and which is a forward and which is a reverse read from the name \
            [default: Do not].\n\n\
            Currently requires download from NCBI rather than ENA.')
    stdout_help = fix('Output sequences to STDOUT. Currently requires --unsorted [default: Do not].')

    parser.add_argument(
        '-f', '--output-format-possibilities', '--output_format_possibilities',
        default=default_formats,
        choices=['sra', 'fastq', 'fastq.gz','fasta','fasta.gz'],
        nargs='+',
        help=formats_help)
    parser.add_argument(
        '--force',
        help='Re-download / extract files even if they already exist [default: Do not].',
        action='store_true')
    parser.add_argument('--unsorted', help=unsorted_help, action='store_true')
    parser.add_argument('--stdout', help=stdout_help, action='store_true')
    return parser

def check_get_and_extract_common_args(args):
    """Abort when both an output directory and STDOUT output are requested.

    Reads args.output_directory and args.stdout. If both are truthy, an
    error is logged and the process exits with status 1; otherwise the
    function returns None without side effects.
    """
    if not args.output_directory or not args.stdout:
        # At most one of the two destinations was requested: nothing to reject.
        return
    logging.error("--output-directory and --stdout are incompatible")
    sys.exit(1)


def main():
    """Parse the Kingfisher command line and run the selected subcommand.

    Builds the 'get', 'extract', 'annotate' and 'authorship' subparsers on a
    BirdArgparser, parses the arguments, and forwards them to the matching
    function of the `kingfisher` module (download_and_extract, extract,
    annotate or authorship).

    Exits with status 1 when --output-directory is combined with --stdout
    (for 'get' and 'extract'), or when 'annotate' is asked for feather or
    parquet output without --output-file.

    Raises:
        Exception: if the parsed subcommand name is not one of the four
            subcommands defined here.
    """
    bird_argparser = BirdArgparser(
        program='Kingfisher',
        authors=[BEN_NAME_AND_CENTRE_AND_EMAIL],
        version=kingfisher.__version__,
        raw_format=True,
        examples={'get': [
            Example(
                ('Download .fastq.gz files of the run ERR1739691 from the ENA, or failing that, download an .sra file from the Amazon AWS Open Data Program and then convert'
                ' to FASTQ (not FASTQ.GZ), or failing that use NCBI prefetch to download and convert that to FASTQ. Output files are put into the current working directory.'),
                'kingfisher get -r ERR1739691 -m ena-ascp aws-http prefetch'),
            Example(
                'Download a .sra from GCP using a service account key with "gcp cp". Payment is required.',
                'kingfisher get -r ERR1739691 -m gcp-cp -f sra --gcp-user-key-file sa-private-key.json --allow-paid'),
            Example(
                'Download a .sra from the free AWS open data program using 8 threads for download and extraction, converting to FASTA.',
                'kingfisher get -r ERR1739691 -m aws-http -f fasta --download-threads 8')],
            'extract': [
                Example(
                    'Extract an SRA file to FASTQ.GZ format using 16 threads (default is 8)',
                    'kingfisher extract --sra ERR1739691.sra -t 16 -f fastq.gz'
                    )],
            'annotate': [
                Example(
                    'Annotate the metadata of a run',
                    'kingfisher annotate -r ERR1739691'),
                Example(
                    'Output metadata of all runs in a BioProject to a CSV file',
                    'kingfisher annotate --bioprojects PRJNA177893 -o PRJNA177893.csv -f csv'),
                Example(
                    'Output the full set of metadata from a run',
                    'kingfisher annotate -r ERR1739691 -a'),
                ]})

    # ---- 'get' subcommand -------------------------------------------------
    get_description = 'Download and extract sequence data from SRA or ENA'
    get_parser = bird_argparser.new_subparser('get', get_description)

    get_parser_common_options = get_parser.add_argument_group(title='common options')
    get_parser_common_options.add_argument(
        '-r','--run-identifiers','--run_identifiers',
        help='Run number(s) to download/extract e.g. ERR1739691',
        nargs='+')
    get_parser_common_options.add_argument(
        '--run-identifiers-list','--run_identifiers_list',
        help='Text file containing a newline-separated list of run identifiers i.e. a 1 column CSV file.')
    get_parser_common_options.add_argument(
        '-p','--bioprojects', nargs='+',
        help='BioProject IDs number(s) to download/extract from e.g. PRJNA621514 or SRP260223')

    get_parser_common_options.add_argument(
        '-m','--download_methods', '--download-methods',
        nargs='+',
        help='How to download .sra file. If multiple are specified, each is tried in turn until one works [required].\n\n' +
        table_roff([
            ["Method",'Description'],
            ['ena-ascp','Download .fastq.gz files from ENA using Aspera, which can then be further converted. This is the fastest method since no fasterq-dump is required.'],
            ['ena-ftp','Download .fastq.gz files from ENA using curl, which can then be further converted. This is relatively fast since no fasterq-dump is required.'],
            ['prefetch','Download .SRA file using NCBI prefetch from sra-tools, which is then extracted with fasterq-dump.'],
            ['aws-http','Download .SRA file from AWS Open Data Program using `aria2c` with multiple connection threads, which is then extracted with `fasterq-dump`.'],
            ['aws-cp','Download .SRA file from AWS using aws s3 cp, which is then extracted with fasterq-dump. Does not usually require payment or an AWS account.'],
            ['gcp-cp','Download .SRA file from Google Cloud gsutil, which is then extracted with fasterq-dump. Requires payment and a Google Cloud account.']
            ]),
        choices=['aws-http', 'prefetch', 'aws-cp', 'gcp-cp', 'ena-ascp','ena-ftp'], required=True)
    get_parser_common_options.add_argument(
        '--output-directory', '--output_directory',
        help=fix('Output directory to write to [default: current working directory]'))

    get_parser_download_args = get_parser.add_argument_group(title='further download options')
    get_parser_download_args.add_argument(
        '--download-threads', '--download_threads',
        type=int,
        help='Number of connection threads to use when downloading data. When >1 aria2 is used rather than curl [default: {}]'.format(
            kingfisher.DEFAULT_DOWNLOAD_THREADS),
        default=kingfisher.DEFAULT_DOWNLOAD_THREADS,
    )
    get_parser_download_args.add_argument(
        '--hide-download-progress', '--hide_download_progress',
        action='store_true',
        help='Do not show progressbar during download [default: unset i.e. show progress]',
    )
    get_parser_download_args.add_argument(
        '--ascp-ssh-key', '--ascp_ssh_key',
        help=fix('a path to the openssh key to use for aspera (i.e. the \
            -i flag of ascp) [default: Use the one bundled with Kingfisher]'))
    get_parser_download_args.add_argument(
        '--ascp-args', '--ascp_args',
        help=fix('extra arguments to pass to ascp e.g. \'-k 2\' to resume with a \
        sparse file checksum [default: \'{}\']'.format(kingfisher.DEFAULT_ASCP_ARGS)),
        default=kingfisher.DEFAULT_ASCP_ARGS)
    get_parser_download_args.add_argument(
        '--allow-paid', '--allow_paid',
        help='Allow downloading from retriever-pays s3 and GCP buckets [default: Do not]',
        action='store_true')
    get_parser_download_args.add_argument(
        '--allow-paid-from-aws', '--allow_paid_from_aws',
        help='Allow downloading from retriever-pays AWS buckets [default: Do not]',
        action='store_true')
    get_parser_download_args.add_argument(
        '--aws-user-key-id', '--aws_user_key_id',
        help='Downloading from AWS requester pays buckets requires a key ID and secret key '\
            '[default: not used]')
    get_parser_download_args.add_argument(
        '--aws-user-key-secret', '--aws_user_key_secret',
        help=fix('Downloading from AWS requester pays buckets requires a key ID and secret key \
            [default: not used]'))
    get_parser_download_args.add_argument(
        '--guess-aws-location', '--guess_aws_location',
        action='store_true',
        help=fix('Instead of using the NCBI location API, guess the address of the file in AWS \
            [default: not used]'))
    get_parser_download_args.add_argument(
        '--allow-paid-from-gcp', '--allow_paid_from_gcp',
        help='Allow downloading from retriever-pays GCP buckets [default: Do not]',
        action='store_true')
    get_parser_download_args.add_argument(
        '--gcp-project', '--gcp_project',
        help=fix('Downloading from Google Cloud buckets require a Google project to charge '
        '(they are requester-pays) e.g. \'my-project\'. This can alternately be set '
        'beforehand using \'gcloud config set project PROJECT_ID\' '
        '[default: value of `gcloud config get-value project` command]'))
    get_parser_download_args.add_argument(
        '--gcp-user-key-file', '--gcp_user_key_file',
        help=fix('Downloading from Google Cloud buckets requires a Google user to be setup. \
            Use this option to specify a JSON-formatted service account key, as per \
            https://cloud.google.com/iam/docs/creating-managing-service-account-keys \
            [default: not used]'))
    get_parser_download_args.add_argument(
        '--prefetch-max-size', '--prefetch_max_size',
        help=fix('Downloading with prefetch has a default limit of 20G file size. \
            Kingfisher disables this. Use this option to reinstate this file size limit \
            e.g. --prefetch-max-size "1G" for a 1 GB limit \
            [default: not used]'))
    get_parser_download_args.add_argument(
        '--check-md5sums', '--check_md5sums',
        help=fix('Check md5sums of downloaded files. This is only implemented for ena-ftp, ena-ascp and aws-http download methods. \
            The prefetch, aws-cp and gcp-cp methods calculate checksums as part of the download process. \
            [default: not used]'),
        action='store_true')

    get_parser_extraction_args = get_parser.add_argument_group(title='further extraction options')
    add_extraction_args(get_parser_extraction_args)
    get_parser_extraction_args.add_argument(
        '-t', '--extraction-threads',
        type=int,
        help='Number of threads to use when extracting .sra files. Ignored when --unsorted is specified. [default: {}]'.format(
            kingfisher.DEFAULT_THREADS
        ),
        default=kingfisher.DEFAULT_THREADS,
    )

    # ---- 'extract' subcommand ---------------------------------------------
    extract_description = 'Extract .sra format files into FASTQ or FASTA format, compressed or uncompressed.'
    extract_parser = bird_argparser.new_subparser('extract', extract_description)
    extract_parser_extraction_group = extract_parser.add_argument_group(title='extraction options')

    extract_parser_extraction_group.add_argument(
        '--sra',
        help='Extract this SRA file [required]',
        required=True)
    extract_parser_extraction_group.add_argument(
        '--output-directory', '--output_directory',
        help=fix('Output directory to write to [default: current working directory]'))
    add_extraction_args(extract_parser_extraction_group)
    extract_parser_extraction_group.add_argument(
        '-t', '--threads',
        type=int,
        help='Number of threads to use for extraction [default: {}]'.format(
            kingfisher.DEFAULT_THREADS),
        default=kingfisher.DEFAULT_THREADS)

    # ---- 'annotate' subcommand --------------------------------------------
    annotate_description = 'Annotate runs by their metadata e.g. number of sequenced bases, BioSample attributes, etc.'
    annotate_parser = bird_argparser.new_subparser('annotate', annotate_description)

    annotate_parser.add_argument(
        '-r','--run-identifiers','--run_identifiers',
        help='Run number to download/extract e.g. ERR1739691',
        nargs='+',
    )
    # Each option string is listed once; the first long option still sets
    # the destination attribute (args.run_identifiers_list).
    annotate_parser.add_argument(
        '--run-identifiers-list','--run_identifiers_list','--run-accession-list','--run_accession_list',
        help='Text file containing a newline-separated list of run identifiers i.e. a 1 column CSV file.',
    )
    annotate_parser.add_argument(
        '-p','--bioprojects', nargs='+',
        help='BioProject IDs number(s) to download/extract from e.g. PRJNA621514 or SRP260223')
    annotate_parser.add_argument(
        '-o','--output-file',
        help='Output file to write to [default: stdout]'
    )
    annotate_parser.add_argument(
        '-f','--output-format','--output_format',
        help='Output format [default human]',
        default='human',
        choices=['human','csv','tsv','json','feather','parquet']
    )
    annotate_parser.add_argument(
        '-a','--all-columns','--all_columns',
        help='Print all metadata columns [default: Print only a few select ones]',
        action='store_true',
    )

    # ---- 'authorship' subcommand ------------------------------------------
    authorship_description = 'Find publication / authorship of SRA accessions'
    authorship_parser = bird_argparser.new_subparser('authorship', authorship_description)
    authorship_parser.add_argument(
        '-r','--run-identifiers','--run_identifiers',
        help='Run number to download/extract e.g. ERR1914274',
        nargs='+',
    )
    # list (each option string listed once, as for 'annotate')
    authorship_parser.add_argument(
        '--run-identifiers-list','--run_identifiers_list','--run-accession-list','--run_accession_list',
        help='Text file containing a newline-separated list of run identifiers i.e. a 1 column CSV file.',
    )

    args = bird_argparser.parse_the_args()

    logging.info("Kingfisher v{}".format(kingfisher.__version__))

    if args.subparser_name == 'get':
        check_get_and_extract_common_args(args)
        kingfisher.download_and_extract(
            run_identifiers = args.run_identifiers,
            run_identifiers_file = args.run_identifiers_list,
            bioproject_accessions = args.bioprojects,
            download_methods = args.download_methods,
            output_format_possibilities = args.output_format_possibilities,
            force = args.force,
            unsorted = args.unsorted,
            stdout = args.stdout,
            gcp_project = args.gcp_project,
            gcp_user_key_file = args.gcp_user_key_file,
            aws_user_key_id = args.aws_user_key_id,
            aws_user_key_secret = args.aws_user_key_secret,
            guess_aws_location = args.guess_aws_location,
            allow_paid = args.allow_paid,
            allow_paid_from_gcp = args.allow_paid_from_gcp,
            allow_paid_from_aws = args.allow_paid_from_aws,
            ascp_ssh_key = args.ascp_ssh_key,
            ascp_args = args.ascp_args,
            download_threads = args.download_threads,
            extraction_threads = args.extraction_threads,
            hide_download_progress = args.hide_download_progress,
            prefetch_max_size = args.prefetch_max_size,
            check_md5sums = args.check_md5sums,
            output_directory = args.output_directory if args.output_directory is not None else '.',
        )
    elif args.subparser_name == 'extract':
        # 'extract' defines --output-directory and --stdout as well, so the
        # same incompatibility check that 'get' performs applies here.
        check_get_and_extract_common_args(args)
        output_files = kingfisher.extract(
            sra_file = args.sra,
            output_format_possibilities = args.output_format_possibilities,
            force = args.force,
            unsorted = args.unsorted,
            stdout = args.stdout,
            threads = args.threads,
            output_directory = args.output_directory if args.output_directory is not None else '.',
        )
        logging.info("Output files: {}".format(', '.join(output_files)))
    elif args.subparser_name == 'annotate':
        # These formats are rejected unless an output file is given.
        if args.output_format in ('feather','parquet') and not args.output_file:
            logging.error("--output-file is required when --output-format is {}".format(args.output_format))
            sys.exit(1)
        kingfisher.annotate(
            run_identifiers = args.run_identifiers,
            run_identifiers_file = args.run_identifiers_list,
            bioproject_accessions = args.bioprojects,
            output_file = args.output_file,
            output_format = args.output_format,
            all_columns = args.all_columns,
        )
    elif args.subparser_name == 'authorship':
        kingfisher.authorship(
            run_identifiers = args.run_identifiers,
            run_identifiers_file = args.run_identifiers_list,
        )
    else:
        raise Exception("Programming error")

    logging.info("Kingfisher done.")



# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
