import time

import requests
import asyncio
import os
import argparse
import pandas as pd
import numpy as np


def get_args():
    """Parse the command-line options of the downloader.

    Returns:
        argparse.Namespace: Parsed options; ``SCRAPER_API_URL`` holds the
        base URL of the scraper API (``http://localhost:5011`` when the
        option is not given).
    """
    cli = argparse.ArgumentParser(
        prog=__name__,
        description=f'{__name__} as data downloader',
        # Makes --help show the default value next to each option.
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument(
        '--SCRAPER_API_URL',
        default='http://localhost:5011',
        type=str,
        help='URL of the scraper api',
    )

    namespace = cli.parse_args()
    return namespace


def get_categories():
    """Return the topic of every category listed by the scraper API.

    Issues a GET to ``<SCRAPER_API_URL>/categories/nl`` (base URL read from
    the environment) and collects the ``topic`` field of each entry in the
    JSON response.

    Returns:
        list: One topic value per category entry, in response order.
    """
    listing = requests.get(f'{os.environ["SCRAPER_API_URL"]}/categories/nl')

    topics = []
    for entry in listing.json():
        topics.append(entry['topic'])
    return topics


def get_category_articles(category: str):
    """Request the article listing of a single category.

    Args:
        category: Category name appended to the
            ``<SCRAPER_API_URL>/category/all/`` endpoint.

    Returns:
        list: ``[category, response]`` - the category that was asked for
        and the raw ``requests`` response (not yet decoded or checked).
    """
    endpoint = f'{os.environ["SCRAPER_API_URL"]}/category/all/{category}'
    reply = requests.get(endpoint)
    return [category, reply]


async def get_category_articles_async(category: str):
    """Run ``get_category_articles`` in a worker thread and await its result.

    Args:
        category: Category name forwarded unchanged.

    Returns:
        Whatever ``get_category_articles(category)`` returns.
    """
    # The blocking HTTP call is moved off the event loop thread.
    fetched = await asyncio.to_thread(get_category_articles, category)
    return fetched


def get_full_article(id, payload):
    """Fetch the full content of one article, with a fallback endpoint.

    POSTs ``payload`` as JSON to ``<SCRAPER_API_URL>/category/all/article``.
    When that answers with HTTP 204, the same payload is POSTed once to the
    ``/fallback`` sub-path and that second response is used instead.

    Args:
        id: Identifier of the article; passed through untouched.
        payload: JSON-serialisable request body.

    Returns:
        list: ``[id, payload, response]`` where ``response`` is the raw
        ``requests`` response that was ultimately obtained.
    """
    article_url = f'{os.environ["SCRAPER_API_URL"]}/category/all/article'
    reply = requests.post(url=article_url, json=payload)

    # Anything other than 204 is handed back as-is.
    if reply.status_code != 204:
        return [id, payload, reply]

    fallback_reply = requests.post(url=f'{article_url}/fallback', json=payload)
    return [id, payload, fallback_reply]


async def get_full_article_async(article_item):
    """Build the request payload for one article row and fetch it in a thread.

    Args:
        article_item: Indexable article record. Index 0 is forwarded as the
            article id, index 1 becomes ``pageLink`` and indices 2-5 become
            ``imageLink``, ``topic``, ``title`` and ``teaser``.

    Returns:
        Whatever ``get_full_article(article_item[0], payload)`` returns.
    """

    def _text_or_blank(value):
        # pd.isna() recognises every missing-value marker (any NaN float,
        # None, pd.NA). The previous `value is np.nan` identity check only
        # matched the np.nan singleton, so NaN floats created when a
        # DataFrame is converted to an object array slipped through and
        # ended up in the JSON payload.
        return '' if pd.isna(value) else value

    payload = {
        'pageLink': article_item[1],
        'imageLink': _text_or_blank(article_item[2]),
        'topic': _text_or_blank(article_item[3]),
        'title': _text_or_blank(article_item[4]),
        'teaser': _text_or_blank(article_item[5]),
    }

    # The blocking HTTP call is moved off the event loop thread.
    return await asyncio.to_thread(get_full_article, article_item[0], payload)


async def compute_all_article_items():
    """Download the article listing of every category and store it as CSV.

    Fetches the category names, requests all category listings concurrently,
    concatenates the decoded JSON lists and writes them, together with a
    ``category`` column naming the originating category, to
    ``data/articles.csv``.
    """
    all_categories = get_categories()

    print(f'Download started for {all_categories} ...')
    results = await asyncio.gather(*[get_category_articles_async(category) for category in all_categories])

    all_articles = []
    cats = []
    for category, response in results:
        articles = response.json()
        labels = [category] * len(articles)
        # Print the category name itself: indexing into the label list
        # (labels[0]) raised IndexError for a category with no articles.
        print(f'{category} -> {len(articles)}, {len(labels)}')
        cats += labels
        all_articles += articles

    print(f'Articles: {len(all_articles)} \t Cats: {len(cats)}')

    print('Storing all articles...')
    raw_data = pd.DataFrame(all_articles)
    raw_data['category'] = cats
    # to_csv does not create missing parent directories.
    os.makedirs('data', exist_ok=True)
    raw_data.to_csv('data/articles.csv', index=False)
    print('Storing done')


async def compute_all_full_articles(limit=20):
    """Download full article bodies and store one CSV row per content item.

    Reads ``data/articles.csv``, requests the full article for the first
    ``limit`` rows concurrently, and writes every ``content`` entry found
    under ``sectionList[*].contentBody[*]`` to
    ``data/articles_x_contents.csv`` with the columns ``id`` (article id),
    ``inner_id`` (position inside its ``contentBody``) and ``content``.

    Args:
        limit: Maximum number of rows of ``data/articles.csv`` to download.
            Defaults to 20, the cap that used to be hard-coded; pass
            ``None`` to download every row.
    """
    raw_data = pd.read_csv('data/articles.csv')
    article_rows = raw_data.values[:limit]

    # Report the number of rows actually requested, not the size of the file.
    print(f'Download started for {len(article_rows)} article items ...')

    results = await asyncio.gather(
        *[get_full_article_async(article_item) for article_item in article_rows])

    print('Storing all articles * content...')

    all_articles = []

    for article_id, _payload, response in results:
        try:
            sections = response.json()
        except ValueError:
            # A response without a JSON body (e.g. HTTP 204 from the fallback
            # endpoint) makes .json() raise a ValueError subclass; skip that
            # article instead of discarding every other download.
            print(f'Skipping article {article_id}: no JSON body (HTTP {response.status_code})')
            continue

        # A null sectionList / contentBody simply contributes no rows.
        for section in sections['sectionList'] or []:
            for idx, content in enumerate(section['contentBody'] or []):
                all_articles.append([article_id, idx, content['content']])

    print(f'Articles * contents: {len(all_articles)}')

    raw_data = pd.DataFrame(all_articles, columns=['id', 'inner_id', 'content'])

    raw_data.to_csv('data/articles_x_contents.csv', index=False)

    print('Storing done')


async def main():
    """Run both download stages in order: article listings, then full articles."""
    await compute_all_article_items()
    # asyncio.sleep suspends only this coroutine; the previous time.sleep
    # call blocked the whole event loop thread for the 20 seconds.
    # NOTE(review): the reason for the 20 s pause is not visible here -
    # presumably it gives the scraper API time to settle; confirm.
    await asyncio.sleep(20)
    await compute_all_full_articles()


if __name__ == "__main__":
    # The download helpers read the API base URL from the environment, so
    # the command-line value is published there before anything runs.
    os.environ["SCRAPER_API_URL"] = get_args().SCRAPER_API_URL
    asyncio.run(main())
