using CSV
using DataFrames
using JuliaFormatter
using EzXML

#=
Extract language tags from ISO-639 datasets.

ISO-639 has four main levels:
- 639-1, with two-letter language tags.
- 639-2, with three-letter language tags for a broader coverage of languages than 639-1. Superset of 639-1.
- 639-3, with three-letter language tags with a comprehensive coverage of languages. Superset of 639-2.
- 639-5, with three-letter language tags for language families and groups. Superset of 639-2.

=#

# Build two parallel vectors pairing `Part1` values with the `Id` of the same row.
#
# `info_639_3` is iterated with `eachrow`; rows whose `Part1` field is `missing`
# are skipped. Returns `(part1_tags, id_tags)`, where entry `i` of each vector
# comes from the same row.
function iso_639_1_639_3_map(info_639_3)
  two_letter = String3[]
  three_letter = String3[]
  for entry in eachrow(info_639_3)
    part1 = entry.Part1
    if !ismissing(part1)
      push!(two_letter, part1)
      push!(three_letter, entry.Id)
    end
  end
  return two_letter, three_letter
end

# Tab-separated tables downloaded from iso639-3.sil.org.
names_639_3 = CSV.read(
  download("https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3_Name_Index.tab"),
  DataFrame;
  delim = "\t",
)
info_639_3 = CSV.read(
  download("https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab"),
  DataFrame;
  delim = "\t",
)
macrolanguages = CSV.read(
  download("https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3-macrolanguages.tab"),
  DataFrame;
  delim = "\t",
)

# Comma-separated language code table downloaded from datahub.io.
names_639_1 = CSV.read(
  download("https://datahub.io/core/language-codes/r/language-codes.csv"),
  DataFrame;
  delim = ",",
)

# Scrape the OpenType language tag registry.

"""
    generate_language_tag_registry(htmlfile)

Parse the HTML file at `htmlfile` and collect the rows of its first `<table>`
into a `DataFrame` with columns `Name`, `Tag` and `ISO_639`.

The first row of the table is skipped (presumably the header row; confirm against
the page). Rows whose tag cell ends with `"(deprecated)"` are ignored, and single
quotes surrounding the tag are stripped.

Throws an error if the document contains no `<table>` element, and an
`AssertionError` if the table has 600 rows or fewer.
"""
function generate_language_tag_registry(htmlfile)
  # Parsing diagnostics written to stderr are discarded.
  htmldoc = redirect_stderr(() -> readhtml(htmlfile), devnull)
  registry_table = findfirst("//table", htmldoc.root)
  # `findfirst` yields `nothing` when nothing matches; fail with a clear message
  # instead of an opaque `MethodError` from the `findall` call below.
  registry_table === nothing && error("no <table> element found in $(htmlfile)")
  rows = findall("./tbody/tr", registry_table)
  # Sanity check: guards against scraping a truncated or restructured page.
  @assert length(rows) > 600

  registry = DataFrame("Name" => String[], "Tag" => String7[], "ISO_639" => String[])
  for row in rows[2:end]
    name_cell = row.firstelement
    tag_cell = name_cell.nextelement
    tag = tag_cell.content
    endswith(tag, "(deprecated)") && continue
    # The third cell is only read for rows that are kept.
    iso_cell = tag_cell.nextelement
    push!(registry, (name_cell.content, strip(tag, '''), iso_cell.content))
  end
  return registry
end

# Expand the registry into two parallel vectors `(iso_tags, opentype_tags)`.
#
# Each `ISO_639` entry is split on ", " and empty pieces are dropped. Every
# remaining code produces one pair, so a registry tag is repeated once for each
# code listed on its row.
function iso_639_3_opentype_map(registry)
  opentype_tags = String7[]
  iso_tags = String7[]
  for (tag, iso_field) in zip(registry.Tag, registry.ISO_639)
    codes = filter(!isempty, split(iso_field, ", "))
    # Special case: only the first three characters of the last code on the
    # "MONT" row are kept (presumably the entry carries trailing text; confirm).
    if tag == "MONT"
      codes[end] = codes[end][1:3]
    end
    # Every code must be exactly three characters long.
    @assert all(code -> length(code) == 3, codes) codes
    for code in codes
      push!(opentype_tags, tag)
      push!(iso_tags, String7(code))
    end
  end
  return iso_tags, opentype_tags
end

# Download the OpenType language tag page and scrape its registry table.
language_registry =
  download("https://learn.microsoft.com/en-us/typography/opentype/spec/languagetags") |>
  generate_language_tag_registry

# Scrape the OpenType script tag registry.

"""
    generate_script_tag_registry(htmlfile)

Parse the HTML file at `htmlfile` and collect the rows of its first `<table>`
into a `DataFrame` with columns `Name` and `Tag`.

The first row of the table is skipped (presumably the header row; confirm against
the page). Single quotes surrounding the tag are stripped. Rows whose name starts
with `"Yi"` get the hard-coded tag `"yi  "` instead of the scraped cell content.

Throws an error if the document contains no `<table>` element, and an
`AssertionError` if the table has 170 rows or fewer.
"""
function generate_script_tag_registry(htmlfile)
  # Parsing diagnostics written to stderr are discarded.
  htmldoc = redirect_stderr(() -> readhtml(htmlfile), devnull)
  registry_table = findfirst("//table", htmldoc.root)
  # `findfirst` yields `nothing` when nothing matches; fail with a clear message
  # instead of an opaque `MethodError` from the `findall` call below.
  registry_table === nothing && error("no <table> element found in $(htmlfile)")
  rows = findall("./tbody/tr", registry_table)
  # Sanity check: guards against scraping a truncated or restructured page.
  @assert length(rows) > 170

  registry = DataFrame("Name" => String[], "Tag" => String7[])
  for row in rows[2:end]
    name_cell = row.firstelement
    name = name_cell.content
    # NOTE(review): the "Yi" tag is hard-coded with two trailing spaces,
    # presumably because the scraped cell does not preserve them; confirm.
    tag = startswith(name, "Yi") ? "yi  " : strip(name_cell.nextelement.content, ''')
    push!(registry, (name, tag))
  end
  return registry
end

# Download the OpenType script tag page and scrape its registry table.
script_registry =
  download("https://learn.microsoft.com/en-us/typography/opentype/spec/scripttags") |>
  generate_script_tag_registry

# Dump the relevant content as dictionaries for use in OpenType.jl.

# Build the expression for a string-macro call `@tag<n>_str` applied to `str`,
# e.g. `n = 3` targets the macro `@tag3_str`.
function as_tag(str, n)
  macro_name = Symbol("@tag", n, "_str")
  return Expr(:macrocall, macro_name, nothing, str)
end

# Build the expression `Dict(k1 => v1, k2 => v2, ...)`, where each key is the
# `as_tag(tag, n)` expression for the corresponding entry of `tags` and each
# value is taken from `values`. Pairing stops at the shorter of the two inputs.
function make_dict(tags, values, n)
  key_exs = as_tag.(tags, n)
  dict_ex = Expr(:call, :Dict)
  for (key_ex, value) in zip(key_exs, values)
    push!(dict_ex.args, Expr(:call, :(=>), key_ex, value))
  end
  return dict_ex
end

# Tag => name dictionaries.
names_dict_639_1 = make_dict(names_639_1.alpha2, names_639_1.English, 2)
names_dict_639_3 = make_dict(names_639_3.Id, names_639_3.Print_Name, 3)
names_dict_opentype = make_dict(language_registry.Tag, language_registry.Name, 4)
script_dict_opentype = make_dict(script_registry.Tag, script_registry.Name, 4)

# Tag => tag dictionaries; the values are themselves tag expressions.
individual_to_macro = make_dict(macrolanguages.I_Id, as_tag.(macrolanguages.M_Id, 3), 3)

# NOTE(review): if the same ISO code is listed under several OpenType tags, the
# generated Dict literal contains repeated keys and only the last one survives
# when it is evaluated. Confirm this is intended.
from, to = iso_639_3_opentype_map(language_registry)
tags_639_to_opentype = make_dict(from, as_tag.(to, 4), 3)

from, to = iso_639_1_639_3_map(info_639_3)
tags_639_1_to_639_3 = make_dict(from, as_tag.(to, 3), 2)

# Write every generated dictionary as a `const` definition into
# `<parent of this directory>/src/generated/tags.jl`, then format that file.
target = joinpath(dirname(@__DIR__), "src", "generated", "tags.jl")

# `open(target, "w")` does not create missing parent directories, so make sure
# the output directory exists before writing.
mkpath(dirname(target))

open(target, "w") do io
  println(io, "# This file was generated by $(@__FILE__)")
  # Constant name => Dict expression, in the order they are written out.
  constants = (
    :macrolanguages_ISO_639_3 => individual_to_macro,
    :language_tag_names_ISO_639_3 => names_dict_639_3,
    :language_tag_names_ISO_639_1 => names_dict_639_1,
    :language_tag_names_opentype => names_dict_opentype,
    :language_tags_ISO_639_3_to_opentype => tags_639_to_opentype,
    :language_tags_ISO_639_1_ISO_639_3 => tags_639_1_to_639_3,
    :script_tags_opentype => script_dict_opentype,
  )
  for (name, dict_ex) in constants
    # Each definition is preceded by a blank line.
    println(io)
    println(io, :(const $name = $dict_ex))
  end
end

# Run JuliaFormatter over the generated file.
format(target)
