/*
 * Decompiled with CFR 0.152.
 */
package it.uniroma1.lcl.jlt.wiktionary;

import it.uniroma1.lcl.jlt.Configuration;
import it.uniroma1.lcl.jlt.util.Files;
import it.uniroma1.lcl.jlt.util.HtmlDecoder;
import it.uniroma1.lcl.jlt.util.Strings;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a Lucene index of noun glosses read from a Wiktionary dump.
 *
 * <p>The dump is scanned line by line. For every page, definition lines
 * (lines starting with {@code "# "}) found under a {@code ===Noun===} heading
 * that follows an {@code ==English==} marker are turned into sentences of the
 * form {@code "a <word> is a <gloss> ."} and stored as documents with the two
 * fields {@code lemma} and {@code gloss}.
 */
public class WiktionaryIndexer {
    /** Index writer; assigned in {@link #createIndex} and used by {@link #createGlossa}. */
    private IndexWriter writer = null;

    /**
     * Cleans wiki markup out of {@code gloss}, prints the result and adds it
     * to the index as one document.
     *
     * @param term  the lemma stored in the {@code lemma} field
     * @param gloss the raw gloss sentence, possibly containing wiki markup
     * @throws CorruptIndexException if the index writer reports a corrupt index
     * @throws IOException           if the index cannot be written
     */
    private void createGlossa(String term, String gloss) throws CorruptIndexException, IOException {
        gloss = WiktionaryIndexer.stripWikiLinks(gloss);
        // Flatten the remaining markup: template braces become parentheses,
        // stray brackets and bold markers are dropped.
        gloss = gloss.toLowerCase(Locale.ROOT)
                .replace("|", " ")
                .replace("{{", "(")
                .replace("}}", ")")
                .replace("[", "")
                .replace("]", "")
                .replace("'''", "");
        gloss = gloss.replace(". .", " .");
        gloss = HtmlDecoder.getInstance().unescapeHTML(gloss);
        System.out.println(term + ": " + gloss);

        Document currentDoc = new Document();
        currentDoc.add(new Field("lemma", term, Field.Store.YES, Field.Index.NOT_ANALYZED));
        currentDoc.add(new Field("gloss", gloss, Field.Store.YES, Field.Index.NOT_ANALYZED));
        this.writer.addDocument(currentDoc);
    }

    /**
     * Replaces every {@code [[target|label]]} or {@code [[target]]} link by
     * its last pipe-separated segment.
     *
     * <p>The closing {@code ]]} is searched for only after the opening
     * {@code [[}, and an unterminated link ends the scan. Every replacement
     * shortens the string, so the loop always terminates.
     *
     * @param gloss text that may contain wiki links
     * @return the text with all well-formed links replaced by their label
     */
    private static String stripWikiLinks(String gloss) {
        int idxS;
        while ((idxS = gloss.indexOf("[[")) >= 0) {
            int idxE = gloss.indexOf("]]", idxS + 2);
            if (idxE < 0) {
                break;
            }
            String[] links = gloss.substring(idxS + 2, idxE).split("\\|");
            // split() drops trailing empty segments, so a link made only of
            // pipes yields an empty array.
            String label = links.length == 0 ? "" : links[links.length - 1];
            gloss = gloss.substring(0, idxS) + label + gloss.substring(idxE + 2);
        }
        return gloss;
    }

    /**
     * Creates a fresh index at {@code indexfile} and fills it from the dump
     * at {@code dumpfile}. The reader and the index writer are closed even
     * when indexing fails.
     *
     * @param indexfile directory in which the index is created (overwritten)
     * @param dumpfile  path of the Wiktionary dump to read
     * @throws IOException if the dump cannot be read or the index cannot be written
     */
    private void createIndex(String indexfile, String dumpfile) throws IOException {
        Directory dir = new SimpleFSDirectory(new File(indexfile));
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        this.writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            this.writer.setMaxMergeDocs(Configuration.getInstance().getMaxMergeDocs());
            BufferedReader br = Files.getBufferedReader(dumpfile);
            try {
                this.indexDump(br);
            } finally {
                br.close();
            }
            this.writer.optimize();
        } finally {
            this.writer.close();
        }
    }

    /**
     * Scans the dump line by line, collects the noun definitions of each page
     * and indexes them when the page ends. Prints the number of indexed terms
     * and collected glosses at the end.
     *
     * @param br reader positioned at the start of the dump
     * @throws IOException if reading or indexing fails
     */
    private void indexDump(BufferedReader br) throws IOException {
        int numberOfTerms = 0;
        int numberOfGloss = 0;
        boolean isNoun = false;
        boolean isEnglish = false;
        String word = "";
        List<String> definitions = new ArrayList<String>();

        String line;
        // readLine() returning null is the only reliable end-of-stream signal.
        while ((line = br.readLine()) != null) {
            if (line.contains("==English==")) {
                isEnglish = true;
            }
            if (isNoun && line.startsWith("===")) {
                // Any heading ends the noun section.
                // NOTE(review): this also clears the English flag, so a later
                // ===Noun=== heading in the same page is ignored unless another
                // ==English== marker is seen first - confirm this is intended.
                isNoun = false;
                isEnglish = false;
                continue;
            }
            if (isNoun && line.startsWith("# ")) {
                definitions.add(line.substring(2));
            }
            if (line.contains("<page>")) {
                isEnglish = false;
                continue;
            }
            if (line.contains("</page>")) {
                if (!definitions.isEmpty()) {
                    ++numberOfTerms;
                    numberOfGloss += definitions.size();
                    this.indexDefinitions(word, definitions);
                }
                isNoun = false;
                definitions.clear();
                word = "";
                continue;
            }
            if (line.contains("<title>")) {
                isNoun = false;
                word = line.trim().replace("<title>", "").replace("</title>", "");
                continue;
            }
            if (isEnglish && line.contains("===Noun===")) {
                isNoun = true;
            }
        }
        System.out.println("# TERMS: " + numberOfTerms);
        System.out.println("# GLOSS: " + numberOfGloss);
    }

    /**
     * Turns each raw definition of {@code word} into a sentence and indexes it.
     *
     * <p>A leading {@code {{...}}} template is moved next to the word as a
     * tag. Empty glosses and glosses starting with {@code "by "} are skipped.
     * The linking phrase depends on how the gloss starts: {@code "is the"}
     * for {@code "art of ..."}, plain {@code "is"} when the gloss already
     * begins with a determiner, otherwise {@code "is an"} or {@code "is a"}
     * depending on whether the first character is a vowel.
     *
     * @param word        the page title the definitions belong to
     * @param definitions raw definition lines without the leading {@code "# "}
     * @throws IOException if indexing fails
     */
    private void indexDefinitions(String word, List<String> definitions) throws IOException {
        for (String definition : definitions) {
            String gloss = definition.replace("</text>", "");
            if (gloss.isEmpty()) {
                continue;
            }
            String tag = "";
            if (gloss.startsWith("{{")) {
                int idx = gloss.indexOf("}}");
                if (idx != -1) {
                    tag = " " + gloss.substring(0, idx + 2).trim();
                    gloss = gloss.substring(idx + 2).trim();
                }
            }
            if (gloss.isEmpty()) {
                continue;
            }
            String lower = gloss.toLowerCase(Locale.ROOT);
            if (lower.startsWith("by ")) {
                continue;
            }

            String link;
            if (lower.startsWith("art of ")) {
                // Must be tested before the determiner check; placed after it,
                // this case can never match.
                link = " is the ";
            } else if (WiktionaryIndexer.startsWithDeterminer(lower)) {
                link = " is ";
            } else if (Strings.isVowel(gloss.charAt(0))) {
                link = " is an ";
            } else {
                link = " is a ";
            }
            this.createGlossa(word, "a " + word + tag + link + gloss + " .");
        }
    }

    /**
     * Tells whether a lower-cased gloss already begins with one of the
     * determiners that make an extra article unnecessary.
     *
     * @param lower the gloss in lower case
     * @return {@code true} if the gloss starts with a known determiner
     */
    private static boolean startsWithDeterminer(String lower) {
        return lower.startsWith("a ")
                || lower.startsWith("any ")
                || lower.startsWith("one ")
                || lower.startsWith("two ")
                || lower.startsWith("either ")
                || lower.startsWith("both ")
                || lower.startsWith("an ")
                || lower.startsWith("the ");
    }

    /**
     * Builds the index using the index and dump locations returned by
     * {@link Configuration}. Any failure is printed to standard error.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            WiktionaryIndexer wi = new WiktionaryIndexer();
            wi.createIndex(Configuration.getInstance().getWiktionaryIndex(), Configuration.getInstance().getWiktionaryDump());
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}

