/*
 * Decompiled with CFR 0.152.
 */
package it.uniroma1.lcl.jlt.ukwac.index;

import edu.stanford.nlp.ling.WordLemmaTag;
import it.uniroma1.lcl.jlt.Configuration;
import it.uniroma1.lcl.jlt.pipeline.stanford.StanfordSentence;
import it.uniroma1.lcl.jlt.ukwac.data.UKWacText;
import it.uniroma1.lcl.jlt.ukwac.iterator.UKWacDumpIterator;
import it.uniroma1.lcl.jlt.util.Language;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a Lucene index with one document per sentence read from a
 * {@link UKWacDumpIterator}.
 *
 * <p>Each sentence is first converted with
 * {@code getMultiwordSentence(MultiwordBelongingTo.WORDNET, Language.EN)}; every
 * element of the converted sentence contributes a stored, non-analyzed
 * {@code "token"}, {@code "lemma"} and {@code "tag"} field to the document, and the
 * document additionally carries a single {@code "sID"} field with the sentence id.
 *
 * <p>An instance is single-use: {@link #index(int, int)} closes the underlying
 * writer when it returns.
 */
public class UKWacSentenceCompoundIndexer {
    /** A progress line is printed whenever the sentence id is a multiple of this value. */
    private static final int LOG_NUM_PAGES = 1000;

    /** Writer over the compound-sentence index directory; opened by the constructor. */
    protected IndexWriter writer = null;

    /** The document most recently built by {@link #index(int, int)}. */
    protected Document currentDoc = null;

    /**
     * Opens an index writer (with the create flag set) on the directory returned by
     * {@code Configuration.getUKWacCompoundSentenceIndexDirectory()}.
     *
     * @throws IllegalStateException if the index directory or writer cannot be opened;
     *         the underlying {@link IOException} is preserved as the cause
     */
    public UKWacSentenceCompoundIndexer() {
        try {
            Configuration config = Configuration.getInstance();
            SimpleFSDirectory dir = new SimpleFSDirectory(new File(config.getUKWacCompoundSentenceIndexDirectory()));
            this.writer = new IndexWriter((Directory)dir, (Analyzer)new StandardAnalyzer(Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED);
            this.writer.setMaxMergeDocs(config.getMaxMergeDocs());
        }
        catch (IOException e) {
            // Fail here with the real cause instead of leaving writer == null and
            // hitting a NullPointerException later inside index().
            throw new IllegalStateException("Cannot open the UKWac compound sentence index for writing", e);
        }
    }

    /**
     * Indexes every sentence, with no limit and no minimum length.
     * Equivalent to {@code index(0, 0)}.
     *
     * @throws IOException if reading the corpus or writing the index fails
     * @throws ParseException declared for compatibility with existing callers
     */
    public void index() throws IOException, ParseException {
        this.index(0, 0);
    }

    /**
     * Indexes sentences until the corpus is exhausted or the limit is reached, then
     * optimizes the index. The writer and the corpus iterator are closed before this
     * method returns, whether it completes normally or throws.
     *
     * @param howManySentences maximum number of sentences to index; {@code 0} means
     *        no limit, a negative value indexes nothing
     * @param minimumSentenceLength only sentences whose word count is strictly greater
     *        than this value are indexed; shorter ones are skipped and do not count
     *        towards the limit
     * @throws IOException if reading the corpus or writing the index fails
     * @throws ParseException declared for compatibility with existing callers
     */
    public void index(int howManySentences, int minimumSentenceLength) throws IOException, ParseException {
        UKWacDumpIterator ukwdi = new UKWacDumpIterator();
        try {
            ukwdi.open();
            try {
                // Number of sentences actually written so far; drives the limit check.
                int indexed = 0;
                while (ukwdi.hasNext() && UKWacSentenceCompoundIndexer.belowLimit(indexed, howManySentences)) {
                    UKWacText text = ukwdi.next();
                    Iterator<StanfordSentence> sentences = text.iterator();
                    while (sentences.hasNext() && UKWacSentenceCompoundIndexer.belowLimit(indexed, howManySentences)) {
                        StanfordSentence sentence = sentences.next();
                        if (sentence.getWords().size() <= minimumSentenceLength) {
                            continue;
                        }
                        ++indexed;
                        // Ids historically start at 2 (the counter was pre-incremented from 1).
                        // Kept as-is so that ids written to the index do not change.
                        // NOTE(review): confirm whether anything depends on this offset.
                        int sentenceId = indexed + 1;
                        this.currentDoc = this.buildDocument(sentence, sentenceId);
                        this.writer.addDocument(this.currentDoc);
                        if (sentenceId % LOG_NUM_PAGES == 0) {
                            System.out.println("Sentence id: " + sentenceId);
                        }
                    }
                }
                this.writer.optimize();
            }
            finally {
                ukwdi.close();
            }
        }
        finally {
            // Always close the writer so it is not left open when indexing fails.
            this.writer.close();
        }
    }

    /** Returns whether another sentence may be indexed under the given limit (0 = unlimited). */
    private static boolean belowLimit(int indexed, int howManySentences) {
        return howManySentences == 0 || indexed < howManySentences;
    }

    /** Builds the document for one sentence: token/lemma/tag fields per element plus the "sID" field. */
    private Document buildDocument(StanfordSentence sentence, int sentenceId) {
        Document doc = new Document();
        StanfordSentence compounds = sentence.getMultiwordSentence(StanfordSentence.MultiwordBelongingTo.WORDNET, Language.EN);
        for (WordLemmaTag wlt : compounds) {
            doc.add((Fieldable)new Field("token", wlt.word(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.add((Fieldable)new Field("lemma", wlt.lemma(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.add((Fieldable)new Field("tag", wlt.tag(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        }
        doc.add((Fieldable)new Field("sID", Integer.toString(sentenceId), Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }

    /**
     * Command-line entry point: indexes all sentences longer than 10 words and prints
     * the elapsed time in seconds.
     *
     * @param args ignored
     * @throws ParseException propagated from {@link #index(int, int)}
     * @throws IOException propagated from {@link #index(int, int)}
     */
    public static void main(String[] args) throws ParseException, IOException {
        System.out.println("::Indexing source sentences: " + Configuration.getInstance().getUKWacDirectory());
        long start = System.currentTimeMillis();
        UKWacSentenceCompoundIndexer xtd = new UKWacSentenceCompoundIndexer();
        xtd.index(0, 10);
        long end = System.currentTimeMillis();
        System.out.println("::Terminated in " + (double)(end - start) / 1000.0 + " sec");
    }
}

