/*
 * Decompiled with CFR 0.152.
 */
package it.uniroma1.lcl.jlt.wiki;

import it.uniroma1.lcl.jlt.Configuration;
import it.uniroma1.lcl.jlt.util.Language;
import it.uniroma1.lcl.jlt.util.PennTokenizer;
import it.uniroma1.lcl.jlt.util.Stopwords;
import it.uniroma1.lcl.jlt.util.Strings;
import it.uniroma1.lcl.jlt.wiki.WikiParsingUtility;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Tokenizes the raw wiki markup of a single page and extracts category and
 * inter-language ("translation") links from it.
 *
 * <p>The raw text handed to the constructor is never modified: every method
 * works on the original markup, so the accessors can be called in any order
 * and any number of times.
 */
public class WikiTokenizer {
    /** Marker found inside tokens that stand for wiki links. */
    private static final String LINK_MARKER = "IsLiNk";

    /** Tokens that are discarded when they make up the whole token. */
    private static final Set<String> TAG_TOKENS = new HashSet<String>(Arrays.asList(
            "quot", LINK_MARKER, "ref", "lt", "ur", "gt", "il", "li", "bt", "", "div", "'"));

    /**
     * Substrings that mark a token as markup residue wherever they occur.
     * ("sup" also covers the former explicit "supgt" check.)
     */
    private static final String[] TAG_FRAGMENTS = {
        "&", "refgt", "quotgt", "divgt", "~", "ISBN", "sup", "//www"
    };

    /** Substrings that mark a token as markup unless the token is a link token. */
    private static final String[] MARKUP_FRAGMENTS = {"/", "=", "|", "'", "<", ">"};

    /** English clitics ('s, 'm, 'd, 'll, 're, 've) which are never treated as tags. */
    private static final Pattern CLITIC = Pattern.compile("'(?:[sSmMdD]|ll|re|ve)");

    /** A colon with a character other than space, tab or underscore on both sides. */
    private static final Pattern NAMESPACE = Pattern.compile(".*[^ \\t_]:[^ \\t_].*");

    private final String id;
    private final String testo;
    private final Language language;

    /**
     * Creates a tokenizer that uses the default language ({@link Language#EN}).
     *
     * @param text raw wiki markup of the page
     * @param id identifier of the page, only used in diagnostic output
     * @throws IOException never thrown by this implementation; declared for
     *         compatibility with existing callers
     */
    public WikiTokenizer(String text, String id) throws IOException {
        this(text, id, null);
    }

    /**
     * Creates a tokenizer for the given language.
     *
     * @param text raw wiki markup of the page
     * @param id identifier of the page, only used in diagnostic output
     * @param language language of the page; {@code null} selects {@link Language#EN}
     * @throws IOException never thrown by this implementation; declared for
     *         compatibility with existing callers
     */
    public WikiTokenizer(String text, String id, Language language) throws IOException {
        this.id = id;
        this.testo = text;
        this.language = language == null ? Language.EN : language;
    }

    /**
     * Tells whether a token is markup residue that must be dropped.
     *
     * @param s token to inspect, must not be {@code null}
     * @return {@code true} if the token is one of the known tag tokens, starts
     *         with a pipe, contains one of the tag fragments, or contains a
     *         markup character without being a link token; always {@code false}
     *         for the English clitics {@code 's 'm 'd 'll 're 've}
     */
    public boolean isTag(String s) {
        // Clitics contain an apostrophe but are genuine tokens: check them first.
        if (CLITIC.matcher(s).matches()) {
            return false;
        }
        if (TAG_TOKENS.contains(s) || s.startsWith("|")) {
            return true;
        }
        if (containsAny(s, TAG_FRAGMENTS)) {
            return true;
        }
        // Link tokens may legitimately carry markup characters.
        return !s.contains(LINK_MARKER) && containsAny(s, MARKUP_FRAGMENTS);
    }

    /** Returns {@code true} if {@code s} contains at least one of {@code fragments}. */
    private static boolean containsAny(String s, String[] fragments) {
        for (String fragment : fragments) {
            if (s.contains(fragment)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether a token looks like a namespaced title, i.e. contains a
     * colon directly surrounded by characters other than space, tab or
     * underscore.
     *
     * @param s token to inspect, must not be {@code null}
     * @return {@code true} if the token matches the namespace pattern
     */
    public static boolean isNamespace(String s) {
        return NAMESPACE.matcher(s).matches();
    }

    /**
     * Tells whether a title starts with the configured category prefix, either
     * as configured or in lower case.
     *
     * @param s title to inspect, may be {@code null}
     * @return {@code true} if {@code s} starts with the category prefix;
     *         {@code false} for {@code null}
     */
    public boolean isCategory(String s) {
        if (s == null) {
            return false;
        }
        String categoryPrefix = Configuration.getInstance().getCategoryPrefix();
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
        return s.startsWith(categoryPrefix)
                || s.startsWith(categoryPrefix.toLowerCase(Locale.ROOT));
    }

    /**
     * Returns the page text processed for indexing.
     *
     * @return same as {@code getText(true)}
     */
    public String getText() {
        return this.getText(true);
    }

    /**
     * Cleans the wiki markup and returns the surviving tokens.
     *
     * <p>When {@code processForIndex} is {@code true} each token is stripped of
     * punctuation, non-alphanumeric tokens are dropped, tokens are written one
     * per line and every run of stopwords that follows a kept token is
     * replaced by a single {@code #} line. Otherwise tokens are joined with
     * spaces and, for English, the result is passed through
     * {@link PennTokenizer#tokenize}.
     *
     * <p>This method does not alter the text held by this tokenizer.
     *
     * @param processForIndex whether to produce the one-token-per-line index form
     * @return the processed text; for pages starting with a redirect directive
     *         the value returned by {@code WikiParsingUtility.redirectionPage},
     *         which may be {@code null}
     */
    public String getText(boolean processForIndex) {
        Stopwords stopwords = Stopwords.getInstance(this.language);
        String redirectTarget = WikiParsingUtility.redirectionPage(this.testo);
        if (this.testo.startsWith("#redirect")
                || this.testo.startsWith("#REDIRECT")
                || this.testo.startsWith("#Redirect")) {
            return redirectTarget;
        }

        // Work on a local copy so that the raw markup stays available to the
        // category / translation extractors and to later getText calls.
        String text = WikiParsingUtility.risolviEntities(this.testo);
        text = WikiParsingUtility.rimuoviTestoInutile(text);
        text = WikiParsingUtility.rimuoviNewLine(text);
        text = WikiParsingUtility.rimuoviDoppieQuadre(text);
        text = WikiParsingUtility.rimuoviParentesi(text);
        text = text.replace("\\", "");

        StringBuilder out = new StringBuilder();
        // Starts as true so that leading stopwords do not emit a separator.
        boolean lastWasStopword = true;
        for (String rawToken : text.split("\\s")) {
            String token = rawToken;
            if (processForIndex) {
                token = token.contains(LINK_MARKER)
                        ? WikiParsingUtility.rimuoviPunteggiaturaLink(token)
                        : WikiParsingUtility.rimuoviPunteggiatura(token);
            }
            if (this.isTag(token)
                    || WikiTokenizer.isNamespace(token)
                    || (processForIndex && !Strings.isAlphaNum(token))) {
                continue;
            }
            if (token.endsWith(LINK_MARKER) || token.endsWith("IsSpAcE")) {
                // A placeholder marker survived the cleaning: report and skip it.
                System.out.println("***ERRORE IS LINK***, " + token + "," + this.id);
                continue;
            }
            if (processForIndex && stopwords != null && stopwords.isStopword(token)) {
                if (!lastWasStopword) {
                    out.append("#\n");
                }
                lastWasStopword = true;
                continue;
            }
            out.append(token).append(processForIndex ? '\n' : ' ');
            lastWasStopword = false;
        }

        String result = out.toString();
        if (!processForIndex && this.language == Language.EN) {
            result = PennTokenizer.tokenize(result);
        }
        return result;
    }

    /**
     * Returns the category names linked from the page, for English pages only.
     *
     * @return the category names (prefix and sort key removed); an empty set
     *         if the page language is not {@link Language#EN}
     */
    public Set<String> getCategoryLemmas() {
        if (this.language != Language.EN) {
            return new HashSet<String>();
        }
        return this.extractCategoryNames();
    }

    /**
     * Returns the category names linked from the page, whatever its language.
     *
     * @return the category names with the category prefix and any
     *         {@code |sort key} part removed
     */
    public Set<String> getFullCategoriesName() {
        return this.extractCategoryNames();
    }

    /**
     * Scans the raw markup for {@code [[<category prefix>name|sortkey]]} links
     * and collects the {@code name} parts.
     *
     * <p>A link ends at the first {@code ]} or at the end of the line,
     * whichever comes first. Scanning stops at the first link after which no
     * {@code ]} occurs anywhere in the text.
     */
    private Set<String> extractCategoryNames() {
        Set<String> names = new HashSet<String>();
        String opening = "[[" + Configuration.getInstance().getCategoryPrefix();
        int searchFrom = 0;
        int linkStart;
        while ((linkStart = this.testo.indexOf(opening, searchFrom)) != -1) {
            int nameStart = linkStart + opening.length();
            int nameEnd = this.testo.indexOf(']', nameStart);
            int lineEnd = this.testo.indexOf('\n', nameStart);
            if (lineEnd != -1 && nameEnd > lineEnd) {
                nameEnd = lineEnd;
            }
            if (nameEnd == -1) {
                return names;
            }
            String name = this.testo.substring(nameStart, nameEnd);
            int pipe = name.indexOf('|');
            if (pipe != -1) {
                name = name.substring(0, pipe);
            }
            names.add(name);
            searchFrom = nameEnd;
        }
        return names;
    }

    /**
     * Returns the inter-language links of the page, i.e. the content of every
     * {@code [[xx:Title]]} link whose {@code xx} part (upper-cased) is the name
     * of a {@link Language} constant.
     *
     * @return the trimmed link contents, e.g. {@code "it:italia"}
     */
    public Set<String> getTranslationLinks() {
        Set<String> translations = new HashSet<String>();
        int pos = 0;
        while ((pos = this.testo.indexOf("[[", pos)) != -1) {
            pos += 2;
            int linkEnd = this.testo.indexOf(']', pos);
            if (linkEnd == -1) {
                // No closing bracket is left in the text: no further link can be complete.
                break;
            }
            int lineEnd = this.testo.indexOf('\n', pos);
            if (lineEnd != -1 && linkEnd > lineEnd) {
                linkEnd = lineEnd;
            }
            String link = this.testo.substring(pos, linkEnd);
            int colon = link.indexOf(':');
            if (colon == -1) {
                continue;
            }
            try {
                // Locale.ROOT: the default locale must not alter the language code
                // (e.g. "it" would become "\u0130T" under a Turkish locale).
                Language.valueOf(link.substring(0, colon).toUpperCase(Locale.ROOT));
                translations.add(link.trim());
            }
            catch (IllegalArgumentException ignored) {
                // The text before the colon is not a known language code:
                // this is an ordinary link, not a translation link.
            }
        }
        return translations;
    }

    /**
     * Small manual demo: prints the non-index text of a sample markup string.
     *
     * @param args ignored
     * @throws IOException declared by the {@link WikiTokenizer} constructors
     */
    public static void main(String[] args) throws IOException {
        System.out.println(new WikiTokenizer("from:1972 till:1990 color:PA text:\"[[]]Valentin Bauer, CSU ([[1972]]-[[1990]])\"    bla bla [[bla bla]] [[]] [   ] [[it:italia]]  [[  ccef:ere ]] [[en:Italy]]  [[abc:]] : [[fr:Italie]]xxxx.", "id").getText(false));
    }
}

