; The Tofita Engine
; Copyright (C) 2021-2023 Oleh Petrenko
;
; This program is free software: you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published by
; the Free Software Foundation, version 3 of the License.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public License
; along with this program.  If not, see <https://www.gnu.org/licenses/>.

; Based on Redox OS trampoline for bringing up APs (x86_64/trampoline.asm)

; This code is used to start SMP cores (AP)
; Sets GDT, CPU features, paging, then enters 64-bit mode & jumps to the engine
default abs ; Memory operands are encoded as absolute addresses (not RIP-relative) in the 64-bit part
org 0x00008000 ; Known absolute address in NON-virtual memory; every label below is computed from it
section .head ; Required for proper linking, but no sections actually created

; Pointer-sized padding
; (byte size that the `arguments` entry stub is padded to, see `times` below)
%define PADDING 8

; Make types unambiguous
; (C-like aliases for the NASM data-definition directives)
%define uint8_t db ; 1 byte
%define uint16_t dw ; 2 bytes
%define uint32_t dd ; 4 bytes
%define uint64_t dq ; 8 bytes

; Entry point
; Not exported: entered by absolute address
;
; Layout, as byte offsets from `arguments` (which sits at the `org` address):
;    +0  entry stub (cli + short jmp), padded with nop up to PADDING bytes
;    +8  .padding
;   +16  .cpuIndex
;   +24  .pageTable
;   +28  .unused
;   +32  .stackStart
;   +40  .stackEnd
;   +48  .code
; Any change to the fields below shifts these offsets.
use16
arguments:
    ; Disable interrupts
    cli
    ; Avoid interpreting arguments by the CPU
    ; (the parameter bytes below are data, so execution must skip over them)
    jmp short realModeApStart

    ; Known padding
    ; (this code makes `arguments` "function" to be exactly PADDING bytes of size)
    ; (this way position of .cpuIndex is easy to calculate by fixed offset)
    times PADDING - ($ - arguments) nop

    ; This memory is overwritten directly as a way to pass parameters
    ; (the code that fills it in is not part of this file)
    .padding: uint64_t 0 ; TODO Unused
    .cpuIndex: uint64_t 0 ; Current CPU core number; its address is handed to the entry point in rdi
    .pageTable: uint32_t 0 ; PML4 (physical address, loaded into cr3 while still in 16-bit mode)
    .unused: uint32_t 0 ; TODO Unused
    .stackStart: uint64_t 0 ; TODO Unused
    .stackEnd: uint64_t 0 ; Stack used by CRT (rsp is derived from this value)
    .code: uint64_t 0 ; Entry point (64-bit address called once long mode is active)

; Selectors into gdtTemplate (byte offset of the descriptor inside the table)
%define SYS_CODE64_SEL 0x10
%define SYS_DATA32_SEL 0x18

; Actual AP entry point body
; Runs in 16-bit mode with interrupts already disabled by the `arguments` stub.
; Loads the page table, enables the required CPU features, loads the GDT and
; switches straight to long mode, finishing with a far jump to longModeApStart.
; The order of the control register / MSR writes below matters: do not reorder.
use16
realModeApStart:
	; TODO cld? Clear direction flag just in case
    ; Segment selectors
    ; (all zero, so the labels of this file are usable as plain 16-bit offsets)
    xor ax, ax
    mov ds, ax
    mov es, ax
    mov ss, ax

    ; Stack pointer later set in rsp
    ; (nothing is pushed or popped before that happens)
    mov sp, 0

    ; Physical pointer to PML4
    ; At this point only 32-bit thus some workarounds required
    ; (which is why .pageTable is a 32-bit field)
    mov edi, [arguments.pageTable]
    mov cr3, edi

    ; Enable FPU
    mov eax, cr0
    and al, 11110011b ; Clear task switched (3) and emulation (2)
    or al, 00100010b ; Set numeric error (5) monitor co-processor (1)
    mov cr0, eax

    ; CR4 feature bits that get set:
    ; 18: Enable OSXSAVE
    ; 10: Unmasked SSE exceptions
    ; 9: FXSAVE/FXRSTOR
    ; 7: Page Global
    ; 5: Page Address Extension
    ; 4: Page Size Extension
    ; NOTE(review): no CPUID check is done before setting these - presumably all
    ; supported CPUs provide them; confirm for OSXSAVE in particular
    mov eax, cr4
    or eax, 1 << 18 | 1 << 10 | 1 << 9 | 1 << 7 | 1 << 5 | 1 << 4
    mov cr4, eax

    ; Initialize floating point registers
    fninit

    ; Load protected mode GDT
    ; (gdtr holds the limit and the address of gdtTemplate)
    lgdt [gdtr]

    ; Read from the EFER MSR
    mov ecx, 0xC0000080
    rdmsr
    ; Set the Long-Mode-Enable and NXE bit
    ; (8: Long Mode Enable, 11: No-Execute Enable)
    or eax, 1 << 11 | 1 << 8
    wrmsr

    ; Enable paging and protection simultaneously
    ; (with Long-Mode-Enable already set, this goes directly to long mode)
    mov ebx, cr0
    ; 31: Paging
    ; 16: Write protect engine
    ; 0: Protected Mode
    or ebx, 1 << 31 | 1 << 16 | 1
    mov cr0, ebx

    ; Far jump to enable Long Mode and load CS with 64 bit segment
    ; (the offset is encoded as 16-bit here, so longModeApStart has to stay below 64 KiB)
    jmp SYS_CODE64_SEL:longModeApStart

; Now registers and pointers are 64-bit
; Reached by the far jump at the end of realModeApStart.
; Reloads the data segment registers, sets up the stack from
; arguments.stackEnd and calls the entry point stored in arguments.code.
; The entry point is not expected to return; if it does, the core is parked.
use64
longModeApStart:
    ; Same flat data selector for every data segment register
    mov rax, SYS_DATA32_SEL
    ; TODO Separate segments for executable memory and data
    mov ds, rax
    mov es, rax
    mov fs, rax
    mov gs, rax
    mov ss, rax

    ; Stack top is placed 256 bytes below the provided stack end
    mov rcx, [arguments.stackEnd]
    lea rsp, [rcx - 256] ; TODO proper stack start

    ; TODO handle ABI properly
    ; rdi = ADDRESS of arguments.cpuIndex (not its value)
    mov rdi, arguments.cpuIndex

    push 0 ; Signal end of stack with 0 return address
    push 0 ; and a few extra entries in case of stack
    push 0 ; problems
    push 0
    mov rbp, rsp ; Frame

    mov rax, [arguments.code]
    o64 call rax

    ; Safety net: `call` pushed the address of this spot as the return address.
    ; Without the loop below, a returning entry point would make the CPU run
    ; straight into the gdtr / gdtTemplate bytes and execute them as code.
    ; Park the core instead; the loop covers wake-ups that `cli` cannot mask.
.halt:
    cli
    hlt
    jmp short .halt

; Operand of `lgdt` in realModeApStart: table limit followed by table address
gdtr:
    .size: uint16_t (16 * 8) - 1 ; Limit = 16 descriptor slots of 8 bytes, minus one
    .offset: uint64_t gdtTemplate ; Address of the first descriptor

; TODO copy the actual template at runtime and set segment selectors
;
; 16 descriptor slots of 8 bytes each (128 bytes in total, matching gdtr.size).
; One 64-bit value per slot; it is stored little-endian, so the low dword of
; each value is the first one in memory.
; The left column is the selector (byte offset of the slot in the table).
; Notes on the right are decoded from the descriptor bits of each value.
gdtTemplate:
    uint64_t 0x0000000000000000 ; 0x00: null descriptor
    uint64_t 0x0000000000000000 ; 0x08: empty
    uint64_t 0x00209b0000000000 ; 0x10: SYS_CODE64_SEL - ring 0 code, 64-bit
    uint64_t 0x00cf93000000ffff ; 0x18: SYS_DATA32_SEL - ring 0 data, flat
    uint64_t 0x00cffa000000ffff ; 0x20: ring 3 code, 32-bit, flat
    uint64_t 0x00cff3000000ffff ; 0x28: ring 3 data, flat
    uint64_t 0x0020fb0000000000 ; 0x30: ring 3 code, 64-bit
    uint64_t 0x0000000000000000 ; 0x38: empty
    uint64_t 0x00008bb960800067 ; 0x40: system descriptor, type 0xB, limit 0x67 (presumably a TSS - TODO confirm)
    uint64_t 0x00000000fffff800 ; 0x48: upper half of the 16-byte descriptor at 0x40 (base bits 63:32)
    uint64_t 0xff40f3fae0003c00 ; 0x50: ring 3 data, base 0xfffae000, limit 0x3c00
    uint64_t 0x0000000000000000 ; 0x58: empty
    uint64_t 0x00cf9a000000ffff ; 0x60: ring 0 code, 32-bit, flat
    uint64_t 0x0000000000000000 ; 0x68: empty
    uint64_t 0x0000000000000000 ; 0x70: empty
    uint64_t 0x0000000000000000 ; 0x78: empty
