
-- One row per known domain, together with its indexing state.
-- String literals use single quotes throughout so the statement also parses
-- when sql_mode contains ANSI_QUOTES (where "..." denotes an identifier).
CREATE TABLE IF NOT EXISTS EC_DOMAIN (
    ID INT PRIMARY KEY AUTO_INCREMENT,

    DOMAIN_NAME VARCHAR(255) UNIQUE NOT NULL,
    DOMAIN_TOP VARCHAR(255) NOT NULL,

    INDEXED INT DEFAULT 0 NOT NULL COMMENT '~number of documents visited / 100',
    -- The default is spelled exactly like the enum label instead of relying on
    -- the case-insensitive collation to match 'active' against 'ACTIVE'.
    STATE ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN') NOT NULL DEFAULT 'ACTIVE' COMMENT '@see EdgeDomainIndexingState',

    RANK DOUBLE,
    -- NOTE(review): presumably the ID of another EC_DOMAIN row; no foreign key
    -- enforces that here -- confirm against the writers of this column.
    DOMAIN_ALIAS INTEGER,
    IP VARCHAR(48),

    INDEX_DATE TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    DISCOVER_DATE TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    -- Computed flag: true for the four states listed, false for every other state.
    IS_ALIVE BOOLEAN AS (STATE IN ('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA')) VIRTUAL
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;


-- One row per known URL, stored as its parts (protocol, port, path, query
-- parameters) and tied to its owning domain. Deleting a domain deletes its URLs.
-- The table default collation is binary; the two ENUM columns override it with
-- a case-insensitive collation.
CREATE TABLE IF NOT EXISTS EC_URL (
    ID INT PRIMARY KEY AUTO_INCREMENT,
    DOMAIN_ID INT NOT NULL,

    PROTO ENUM('http','https','gemini') NOT NULL COLLATE utf8mb4_unicode_ci,
    PATH VARCHAR(255) NOT NULL,
    PORT INT,
    PARAM VARCHAR(255),

    -- Single-quoted literal so the statement also parses under ANSI_QUOTES.
    PATH_HASH BIGINT NOT NULL COMMENT 'Hash of PATH for uniqueness check by domain',

    VISITED BOOLEAN NOT NULL DEFAULT FALSE,

    STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok' COLLATE utf8mb4_unicode_ci,

    -- Uniqueness is enforced on (domain, hash of path) rather than on PATH itself.
    CONSTRAINT CONS UNIQUE (DOMAIN_ID, PATH_HASH),
    FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_bin;

-- Document-level data for a URL. ID is both the primary key and a foreign key
-- to EC_URL(ID), so there is at most one row per URL; deleting the URL deletes
-- its page data.
CREATE TABLE IF NOT EXISTS EC_PAGE_DATA (
    -- NOTE(review): AUTO_INCREMENT on a column that must equal an existing
    -- EC_URL.ID looks unintended (DOMAIN_METADATA uses a plain key for the same
    -- pattern); kept unchanged for compatibility -- confirm writers always
    -- supply ID explicitly.
    ID INT PRIMARY KEY AUTO_INCREMENT,

    TITLE VARCHAR(255) NOT NULL,
    DESCRIPTION VARCHAR(255) NOT NULL,

    WORDS_TOTAL INTEGER NOT NULL,
    FORMAT ENUM('PLAIN', 'UNKNOWN', 'HTML123', 'HTML4', 'XHTML', 'HTML5', 'MARKDOWN') NOT NULL,
    -- Single-quoted literal so the statement also parses under ANSI_QUOTES.
    FEATURES INT NOT NULL COMMENT 'Bit-encoded feature set of document, @see HtmlFeature',

    DATA_HASH BIGINT NOT NULL,
    QUALITY DOUBLE NOT NULL,

    PUB_YEAR SMALLINT,

    FOREIGN KEY (ID) REFERENCES EC_URL(ID) ON DELETE CASCADE
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;

-- Directed link between two domains: at most one row per (source, destination)
-- pair. Rows disappear when either endpoint domain is deleted.
CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK (
    ID               INTEGER NOT NULL AUTO_INCREMENT,
    SOURCE_DOMAIN_ID INTEGER NOT NULL,
    DEST_DOMAIN_ID   INTEGER NOT NULL,

    PRIMARY KEY (ID),

    -- Each ordered pair of domains may be recorded only once.
    CONSTRAINT CONS UNIQUE KEY (SOURCE_DOMAIN_ID, DEST_DOMAIN_ID),

    FOREIGN KEY (SOURCE_DOMAIN_ID)
        REFERENCES EC_DOMAIN (ID)
        ON DELETE CASCADE,
    FOREIGN KEY (DEST_DOMAIN_ID)
        REFERENCES EC_DOMAIN (ID)
        ON DELETE CASCADE
);

-- Per-domain URL counts. ID doubles as the foreign key to EC_DOMAIN(ID), so
-- there is at most one row per domain, removed together with that domain.
CREATE TABLE IF NOT EXISTS DOMAIN_METADATA (
    ID           INTEGER NOT NULL,
    -- The three counts are nullable and start at 0 when not supplied.
    KNOWN_URLS   INTEGER DEFAULT 0,
    VISITED_URLS INTEGER DEFAULT 0,
    GOOD_URLS    INTEGER DEFAULT 0,

    PRIMARY KEY (ID),
    FOREIGN KEY (ID)
        REFERENCES EC_DOMAIN (ID)
        ON DELETE CASCADE
);

-- Feed URLs, keyed by the URL text itself and optionally tied to a domain
-- (DOMAIN_ID is nullable). Deleting the domain deletes its feed URLs.
-- IF NOT EXISTS keeps this statement re-runnable, matching every other
-- CREATE TABLE in this script.
CREATE TABLE IF NOT EXISTS EC_FEED_URL (
    URL VARCHAR(255) PRIMARY KEY,
    DOMAIN_ID INT,

    FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
)
CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;

-- Flattened view of a URL: the reassembled URL string plus columns from its
-- domain and, when present, its page data.
CREATE OR REPLACE VIEW EC_URL_VIEW AS
    SELECT
        -- proto://domain[:port]path[?param]; the port and parameter parts are
        -- left out when the corresponding column is NULL.
        CONCAT(
            u.PROTO,
            '://',
            d.DOMAIN_NAME,
            CASE WHEN u.PORT IS NULL THEN '' ELSE CONCAT(':', u.PORT) END,
            u.PATH,
            CASE WHEN u.PARAM IS NULL THEN '' ELSE CONCAT('?', u.PARAM) END
        )             AS URL,
        u.PATH_HASH   AS PATH_HASH,
        u.PATH        AS PATH,
        d.DOMAIN_NAME AS DOMAIN_NAME,
        d.DOMAIN_TOP  AS DOMAIN_TOP,
        u.ID          AS ID,
        d.ID          AS DOMAIN_ID,
        u.VISITED     AS VISITED,
        -- Page-data columns are NULL for URLs that have no EC_PAGE_DATA row.
        p.QUALITY     AS QUALITY,
        p.DATA_HASH   AS DATA_HASH,
        p.TITLE       AS TITLE,
        p.DESCRIPTION AS DESCRIPTION,
        p.WORDS_TOTAL AS WORDS_TOTAL,
        p.FORMAT      AS FORMAT,
        p.FEATURES    AS FEATURES,
        d.IP          AS IP,
        u.STATE       AS STATE,
        d.RANK        AS RANK,
        d.STATE       AS DOMAIN_STATE
    FROM EC_URL AS u
    LEFT JOIN EC_PAGE_DATA AS p
        ON p.ID = u.ID
    INNER JOIN EC_DOMAIN AS d
        ON u.DOMAIN_ID = d.ID;


-- Domain-to-domain links with the name and top domain of both endpoints
-- resolved from EC_DOMAIN.
CREATE OR REPLACE VIEW EC_RELATED_LINKS_VIEW AS
    SELECT
        lnk.SOURCE_DOMAIN_ID AS SOURCE_DOMAIN_ID,
        src.DOMAIN_NAME      AS SOURCE_DOMAIN,
        src.DOMAIN_TOP       AS SOURCE_TOP_DOMAIN,
        lnk.DEST_DOMAIN_ID   AS DEST_DOMAIN_ID,
        dst.DOMAIN_NAME      AS DEST_DOMAIN,
        dst.DOMAIN_TOP       AS DEST_TOP_DOMAIN
    FROM EC_DOMAIN_LINK AS lnk
    -- EC_DOMAIN is joined twice, once per end of the link.
    INNER JOIN EC_DOMAIN AS src
        ON src.ID = lnk.SOURCE_DOMAIN_ID
    INNER JOIN EC_DOMAIN AS dst
        ON dst.ID = lnk.DEST_DOMAIN_ID;

-- Secondary indexes on EC_DOMAIN; IF NOT EXISTS keeps both statements re-runnable.

-- Index on the INDEXED column.
CREATE INDEX IF NOT EXISTS EC_DOMAIN_INDEXED_INDEX
    ON EC_DOMAIN (INDEXED);

-- Index on the DOMAIN_TOP column.
CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN
    ON EC_DOMAIN (DOMAIN_TOP);
