/*
 * Decompiled with CFR 0.152.
 */
package it.uniroma1.lcl.jlt.gigaword;

import edu.stanford.nlp.ling.WordLemmaTag;
import it.uniroma1.lcl.jlt.Configuration;
import it.uniroma1.lcl.jlt.gigaword.data.GigawordText;
import it.uniroma1.lcl.jlt.gigaword.iterator.GigawordDumpIterator;
import it.uniroma1.lcl.jlt.pipeline.stanford.StanfordSentence;
import it.uniroma1.lcl.jlt.util.CounterReader;
import it.uniroma1.lcl.jlt.util.CounterWriter;
import it.uniroma1.lcl.jlt.util.IntegerCounter;
import it.uniroma1.lcl.jlt.util.Language;
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;

/**
 * Access point for word-frequency counts computed over the Gigaword corpus.
 *
 * <p>Counts are computed either from a Lucene index or from the corpus dump,
 * written to a word-list file, and read back from that file. The loaded list is
 * kept in a static field, so it is shared by every {@code Gigaword} object.
 *
 * <p>None of the static state is synchronized.
 */
public class Gigaword {
    /** Lazily created shared instance handed out by {@link #getInstance()}. */
    private static Gigaword instance = null;

    /** The first word list that was loaded; reused by every later request. */
    private static IntegerCounter<String> gigawordWordlist = null;

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return the shared {@code Gigaword} instance
     */
    public static Gigaword getInstance() {
        if (instance == null) {
            instance = new Gigaword();
        }
        return instance;
    }

    /**
     * Returns the word list using the defaults: no frequency threshold, tag
     * prefix {@code "N"}, counts taken from the index, every occurrence counted.
     *
     * @return the word list, see {@link #getGigawordWordlist(int, String, boolean, boolean)}
     */
    public IntegerCounter<String> getGigawordWordlist() {
        return this.getGigawordWordlist(0, "N", true, false);
    }

    /**
     * Loads the word list, building the backing word-list file first if it does
     * not exist yet.
     *
     * <p>Caveats visible in this method:
     * <ul>
     *   <li>Once a list has been loaded it is returned for every later call,
     *       whatever arguments that call passes.</li>
     *   <li>An existing word-list file is read as-is; {@code tag} and
     *       {@code numberOfParticipatingSentences} only matter when the file
     *       has to be generated.</li>
     *   <li>If building or reading fails, the stack trace is printed and the
     *       (possibly empty) counter created so far is returned and stays
     *       cached.</li>
     * </ul>
     *
     * @param minFrequency threshold handed to both the counter writer and the
     *        counter reader (presumably the minimum count an entry needs to be
     *        kept; confirm against {@code CounterWriter}/{@code CounterReader})
     * @param tag prefix a token's tag must start with for the token to be counted
     * @param fromIndex {@code true} to use the index word-list file and the
     *        Lucene index, {@code false} to use the dump word-list file and the
     *        corpus dump
     * @param numberOfParticipatingSentences when {@code true}, a word is counted
     *        at most once per index document; only used when counting from the
     *        index
     * @return the loaded word list; {@code null} only if the failure happened
     *         before the counter was created
     */
    public IntegerCounter<String> getGigawordWordlist(int minFrequency, String tag, boolean fromIndex, boolean numberOfParticipatingSentences) {
        if (gigawordWordlist != null) {
            return gigawordWordlist;
        }
        String wordlistFilepath = fromIndex
                ? Configuration.getInstance().getGigawordIndexWordlistFilepath()
                : Configuration.getInstance().getGigawordDumpWordlistFilepath();
        try {
            File wordlistFile = new File(wordlistFilepath);
            gigawordWordlist = new IntegerCounter<String>();
            CounterReader cr = new CounterReader();
            if (!wordlistFile.exists()) {
                IntegerCounter<String> counts = fromIndex
                        ? this.getIndexCounts(tag, numberOfParticipatingSentences)
                        : this.getDumpCounts(tag);
                new CounterWriter<String>(counts).writeTo(wordlistFilepath, " ", minFrequency);
            }
            gigawordWordlist = cr.readFrom(wordlistFilepath, " ", minFrequency);
        }
        catch (Exception e) {
            e.printStackTrace();
        }
        return gigawordWordlist;
    }

    /**
     * Looks up the count of a word in the already loaded word list. The word is
     * lowercased before the lookup.
     *
     * @param word the word to look up; {@code null} yields 0
     * @return the stored count, or 0 if the word is {@code null}, is not in the
     *         list, or no word list has been loaded yet (a hint is printed in
     *         the last case)
     */
    public int getGigawordIndexFrequency(String word) {
        if (gigawordWordlist == null) {
            System.out.println("[Wordlist does not exist, use getGigawordWordlist() to obtain the wordlist]");
            return 0;
        }
        if (word == null) {
            return 0;
        }
        Integer freq = (Integer)gigawordWordlist.get(word.toLowerCase());
        if (freq == null) {
            return 0;
        }
        return freq;
    }

    /**
     * Counts lowercased tokens over all non-deleted documents of the Lucene
     * index, reading the parallel {@code "token"} and {@code "tag"} fields.
     *
     * <p>Only tokens whose tag starts with {@code tag} are counted, matching the
     * filter applied by {@link #getDumpCounts(String)}. An {@link IOException}
     * is printed and the counts gathered so far are returned.
     *
     * @param tag prefix a token's tag must start with
     * @param numberOfParticipatingSentences when {@code true}, each word is
     *        counted at most once per document; otherwise every matching
     *        occurrence is counted
     * @return the collected counts
     */
    private IntegerCounter<String> getIndexCounts(String tag, boolean numberOfParticipatingSentences) {
        IntegerCounter<String> cnt = new IntegerCounter<String>();
        HashSet<String> seenWords = new HashSet<String>();
        try {
            IndexReader idx = IndexReader.open((String)Configuration.getInstance().getGigawordSentenceCompoundIndexDirectory());
            try {
                for (int i = 0; i < idx.maxDoc(); ++i) {
                    if (idx.isDeleted(i)) {
                        continue;
                    }
                    seenWords.clear();
                    Document doc = idx.document(i);
                    String[] tokens = doc.getValues("token");
                    String[] tags = doc.getValues("tag");
                    for (int index = 0; index < tokens.length; ++index) {
                        // The tag filter always applies; de-duplication is optional.
                        if (!tags[index].startsWith(tag)) {
                            continue;
                        }
                        String word = tokens[index].toLowerCase();
                        // Set.add returns false when the word was already seen in this document.
                        if (numberOfParticipatingSentences && !seenWords.add(word)) {
                            continue;
                        }
                        cnt.count(word);
                    }
                    if (i % 10000 == 0) {
                        System.out.println(i + " : " + cnt.size());
                    }
                }
            }
            finally {
                // Release the index even when reading a document fails.
                idx.close();
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        return cnt;
    }

    /**
     * Counts lowercased words over the corpus dump. Each sentence is converted
     * to its multiword form before its words are inspected, and only words
     * whose tag starts with {@code tag} are counted.
     *
     * <p>Any exception is printed and the counts gathered so far are returned.
     *
     * @param tag prefix a word's tag must start with
     * @return the collected counts
     */
    private IntegerCounter<String> getDumpCounts(String tag) {
        IntegerCounter<String> cnt = new IntegerCounter<String>();
        int counter = 0;
        try {
            GigawordDumpIterator gigadi = new GigawordDumpIterator();
            gigadi.open();
            try {
                while (gigadi.hasNext()) {
                    GigawordText text = gigadi.next();
                    for (StanfordSentence ss : text) {
                        StanfordSentence compounds = ss.getMultiwordSentence(StanfordSentence.MultiwordBelongingTo.WORDNET, Language.EN);
                        for (WordLemmaTag wlt : compounds.getWords()) {
                            if (wlt.tag().startsWith(tag)) {
                                cnt.count(wlt.word().toLowerCase());
                            }
                        }
                        // Progress report every 10000 sentences.
                        if (++counter % 10000 == 0) {
                            System.out.println(counter);
                        }
                    }
                }
            }
            finally {
                // Close the iterator even when processing a text fails.
                gigadi.close();
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
        return cnt;
    }

    /**
     * Loads the index-based noun word list with threshold 500, counting each
     * word once per document, and prints its size and the count for "checkup".
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Gigaword giga = new Gigaword();
        IntegerCounter<String> wordlist = giga.getGigawordWordlist(500, "N", true, true);
        System.out.println("WordList size: " + wordlist.size());
        System.out.println("Counts for checkup: " + wordlist.get("checkup"));
    }
}

