/*
 * Decompiled with CFR 0.152.
 */
package it.uniroma1.lcl.jlt.dev.gigaword;

import edu.stanford.nlp.ling.WordLemmaTag;
import it.uniroma1.lcl.jlt.Configuration;
import it.uniroma1.lcl.jlt.collocs.exceptions.UnimplementedMethodException;
import it.uniroma1.lcl.jlt.dev.gigaword.GigawordDocument;
import it.uniroma1.lcl.jlt.dev.gigaword.GigawordSentence;
import it.uniroma1.lcl.jlt.dev.util.condition.IntegerCondition;
import it.uniroma1.lcl.jlt.pipeline.stanford.DataProcessor;
import it.uniroma1.lcl.jlt.pipeline.stanford.StanfordSentence;
import it.uniroma1.lcl.jlt.util.Files;
import it.uniroma1.lcl.jlt.util.IntegerCounter;
import it.uniroma1.lcl.jlt.util.Language;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class GigawordIterator
implements Iterator<GigawordDocument> {
    private static Log log = LogFactory.getLog(GigawordIterator.class);
    private Iterator<File> fileIterator;
    private Iterator<GigawordDocument> textIterator;
    private int currentDocNumber = 0;
    private int totalDocNumber;

    public GigawordIterator() {
        this(GigaVersion.VERSION_5);
    }

    public GigawordIterator(GigaVersion version) {
        this(version.getPath());
    }

    public GigawordIterator(String directory) {
        log.info((Object)("Scanning directory " + directory + " looking for .gz files."));
        List<File> files = GigawordIterator.files(directory);
        this.totalDocNumber = files.size();
        log.info((Object)(String.valueOf(this.totalDocNumber) + " files detected"));
        this.fileIterator = files.iterator();
        this.parseNext();
    }

    public static List<File> files(String directory) {
        List<File> files = Files.listFiles(new File(directory), ".*\\.gz");
        Collections.sort(files, new Comparator<File>(){

            @Override
            public int compare(File o1, File o2) {
                return o1.getAbsolutePath().compareTo(o2.getAbsolutePath());
            }
        });
        return files;
    }

    @Override
    public boolean hasNext() {
        return this.textIterator.hasNext() || this.fileIterator.hasNext();
    }

    @Override
    public GigawordDocument next() {
        GigawordDocument text = this.textIterator.next();
        if (!this.textIterator.hasNext() && this.fileIterator.hasNext()) {
            this.parseNext();
        }
        return text;
    }

    protected void parseNext() {
        throw new Error("Unresolved compilation problems: \n\tThe method getTextContent() is undefined for the type Node\n\tThe method getTextContent() is undefined for the type Node\n\tThe method getTextContent() is undefined for the type Node\n");
    }

    @Override
    public void remove() {
        throw new UnimplementedMethodException();
    }

    public static void main(String[] args) throws IOException {
        int sentences = 0;
        int docs = 0;
        IntegerCondition docsCondition = new IntegerCondition(1000);
        docsCondition.setId("NUMBER DOCUMENTS");
        IntegerCondition sentencesCondition = new IntegerCondition(10000);
        sentencesCondition.setId("NUMBER SENTENCES");
        GigawordIterator giga = new GigawordIterator(GigaVersion.VERSION_1);
        IntegerCounter<String> wordCounter = new IntegerCounter<String>();
        while (giga.hasNext()) {
            GigawordDocument gigatext = giga.next();
            ++docs;
            docsCondition.triggerAndIncrement();
            List<Character> posTagList = Arrays.asList(Character.valueOf('N'), Character.valueOf('V'), Character.valueOf('J'));
            for (GigawordSentence sentence : gigatext) {
                ++sentences;
                sentencesCondition.triggerAndIncrement();
                List<WordLemmaTag> processedSentence = DataProcessor.getInstance().processSentence(sentence.getSentence(), false);
                StringBuffer sb = new StringBuffer();
                for (WordLemmaTag wlt : processedSentence) {
                    sb.append(wlt.word());
                    sb.append("/");
                    sb.append(wlt.lemma());
                    sb.append("/");
                    sb.append(wlt.tag());
                    sb.append(" ");
                }
                StanfordSentence processedStanfordSentence = StanfordSentence.fromLine(sb.toString());
                List<String> words = processedStanfordSentence.getTerms(posTagList, Language.EN, StanfordSentence.MultiwordBelongingTo.WORDNET, StanfordSentence.CompoundingParameter.ALLOW_MULTIWORD_EXPRESSIONS, StanfordSentence.CompoundingParameter.APPEND_POS);
                HashSet<String> wordsSet = new HashSet<String>(words);
                for (String word : wordsSet) {
                    wordCounter.count(word);
                }
            }
            if (docs % 1000 != 0) continue;
            wordCounter.saveToFile("output/giga.4.words.counts");
        }
        log.info((Object)("Total documents: " + docs));
        log.info((Object)("Total sentences: " + sentences));
    }

    public static enum GigaVersion {
        VERSION_1{

            @Override
            String getPath() {
                return Configuration.getInstance().getGigawordDirectory();
            }
        }
        ,
        VERSION_5{

            @Override
            String getPath() {
                return Configuration.getInstance().getGigaword5Directory();
            }
        };


        abstract String getPath();
    }
}

