/*
 * Decompiled with CFR 0.152.
 */
package it.uniroma1.lcl.jlt.gigaword.index;

import edu.stanford.nlp.ling.WordLemmaTag;
import it.uniroma1.lcl.jlt.Configuration;
import it.uniroma1.lcl.jlt.gigaword.data.GigawordText;
import it.uniroma1.lcl.jlt.gigaword.iterator.GigawordDumpIterator;
import it.uniroma1.lcl.jlt.pipeline.stanford.StanfordSentence;
import it.uniroma1.lcl.jlt.util.Language;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.Locale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a Lucene index from the multiword ("compound") view of Gigaword sentences.
 *
 * <p>Every indexed sentence becomes one {@link Document} that carries a {@code token}
 * and a {@code tag} field for each element of the sentence's multiword view, plus a
 * single {@code sID} field holding the running sentence counter. All fields are
 * stored and not analyzed.
 *
 * <p>The index location and the merge limit are read from {@link Configuration}. The
 * writer is created in the constructor and closed at the end of
 * {@link #index(int, int)}, so an instance is meant for a single indexing run.
 */
public class GigawordSentenceCompoundIndexer {
    /** A progress line is printed whenever the sentence counter is a multiple of this value. */
    private static final int LOG_NUM_PAGES = 1000;

    /** Writer for the target index; stays {@code null} if the constructor failed to open it. */
    protected IndexWriter writer = null;

    /** Document currently being assembled by {@link #index(int, int)}. */
    protected Document currentDoc = null;

    /**
     * Opens (and overwrites) the index in the configured directory.
     *
     * <p>An {@link IOException} while opening the index is printed and not rethrown;
     * in that case {@link #writer} remains {@code null} and later calls to
     * {@code index(...)} report the problem and do nothing.
     */
    public GigawordSentenceCompoundIndexer() {
        try {
            Configuration config = Configuration.getInstance();
            Directory dir = new SimpleFSDirectory(new File(config.getGigawordSentenceCompoundIndexDirectory()));
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_29);
            // "true" asks the writer to create a new index, replacing an existing one.
            this.writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
            this.writer.setMaxMergeDocs(config.getMaxMergeDocs());
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Indexes every sentence of the dump, without a count limit or length filter. */
    public void index() {
        this.index(0, 0);
    }

    /**
     * Indexes all sentences that have more than {@code minimumSentenceLength} words.
     *
     * @param minimumSentenceLength sentences with this many words or fewer are skipped
     */
    public void index(int minimumSentenceLength) {
        this.index(0, minimumSentenceLength);
    }

    /**
     * Iterates over the Gigaword dump and writes one document per accepted sentence,
     * then optimizes the index and closes the writer and the dump iterator.
     *
     * <p>The sentence counter starts at 1 and is incremented <em>before</em> a document
     * is written, so the first stored {@code sID} is 2 and a non-zero
     * {@code howManySentences} produces at most {@code howManySentences - 1} documents.
     * NOTE(review): this looks like an off-by-one, but it is kept as is because
     * existing indexes may depend on the numbering; confirm before changing.
     *
     * <p>Exceptions raised while indexing are printed and not rethrown. The writer and
     * the dump iterator are closed in every case, so this method cannot be usefully
     * called twice on the same instance.
     *
     * @param howManySentences      upper bound for the sentence counter; 0 means no limit
     * @param minimumSentenceLength sentences with this many words or fewer are skipped
     */
    public void index(int howManySentences, int minimumSentenceLength) {
        if (this.writer == null) {
            // The constructor could not open the index; fail early with a clear message
            // instead of running into a NullPointerException after opening the dump.
            System.err.println("Index writer is not available: nothing has been indexed");
            return;
        }
        int sentenceId = 1;
        GigawordDumpIterator gigadi = new GigawordDumpIterator();
        try {
            gigadi.open();
            while (gigadi.hasNext() && (sentenceId < howManySentences || howManySentences == 0)) {
                GigawordText gigat = gigadi.next();
                Iterator<StanfordSentence> sentences = gigat.iterator();
                while (sentences.hasNext() && (sentenceId < howManySentences || howManySentences == 0)) {
                    StanfordSentence ss = sentences.next();
                    if (ss.getWords().size() <= minimumSentenceLength) {
                        continue;
                    }
                    ++sentenceId;
                    this.currentDoc = new Document();
                    StanfordSentence compounds = ss.getMultiwordSentence(StanfordSentence.MultiwordBelongingTo.WORDNET, Language.EN);
                    for (WordLemmaTag wlt : compounds) {
                        // Fixed locale: the default-locale variant would alter English
                        // tokens on JVMs running under e.g. a Turkish locale.
                        this.currentDoc.add(new Field("token", wlt.word().toLowerCase(Locale.ENGLISH), Field.Store.YES, Field.Index.NOT_ANALYZED));
                        this.currentDoc.add(new Field("tag", wlt.tag(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                    }
                    this.currentDoc.add(new Field("sID", Integer.toString(sentenceId), Field.Store.YES, Field.Index.NOT_ANALYZED));
                    this.writer.addDocument(this.currentDoc);
                    if (sentenceId % LOG_NUM_PAGES == 0) {
                        System.out.println("Sentence id: " + sentenceId);
                    }
                }
            }
            this.writer.optimize();
        }
        catch (Exception e) {
            e.printStackTrace();
        }
        finally {
            // Close both resources even when indexing failed; each close is guarded
            // separately so a failure of the first does not skip the second.
            try {
                this.writer.close();
            }
            catch (Exception e) {
                e.printStackTrace();
            }
            try {
                gigadi.close();
            }
            catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Command-line entry point: runs a small indexing job (limit 100, minimum length 10)
     * and prints the elapsed time in seconds.
     *
     * @param args ignored
     */
    public static void main(String[] args) throws ParseException, IOException {
        System.out.println("::Indexing source sentences: " + Configuration.getInstance().getGigawordTaggedDirectory());
        long start = System.currentTimeMillis();
        GigawordSentenceCompoundIndexer xtd = new GigawordSentenceCompoundIndexer();
        xtd.index(100, 10);
        long end = System.currentTimeMillis();
        System.out.println("::Terminated in " + (double)(end - start) / 1000.0 + " sec");
    }
}

