/*
 * Decompiled with CFR 0.152.
 */
package it.uniroma1.lcl.jlt.gigaword.iterator;

import edu.stanford.nlp.ling.WordLemmaTag;
import it.uniroma1.lcl.jlt.Configuration;
import it.uniroma1.lcl.jlt.Constants;
import it.uniroma1.lcl.jlt.gigaword.data.GigawordText;
import it.uniroma1.lcl.jlt.pipeline.stanford.StanfordSentence;
import it.uniroma1.lcl.jlt.util.Files;
import it.uniroma1.lcl.jlt.util.Strings;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.zip.GZIPInputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Iterates over the gzip-compressed, POS-tagged Gigaword dump files
 * ({@code *.pos.gz}) found in the configured sub-directories of the tagged
 * Gigaword directory.
 *
 * <p>Every line of a dump file is turned into one {@link GigawordText} holding a
 * single {@link StanfordSentence}. A line is a space-separated list of tokens,
 * each token being a {@code |}-separated record whose first field is the word
 * and whose second field is the tag (the third field is used as the tag when
 * the second one is empty).
 *
 * <p>Typical usage: call {@link #open()}, loop with {@link #hasNext()} /
 * {@link #next()}, then call {@link #close()}.
 *
 * <p>Note: for backward compatibility {@link #next()} returns {@code null}
 * (instead of throwing {@code NoSuchElementException}) once all files have been
 * consumed or when an I/O error occurs.
 */
public class GigawordDumpIterator implements Iterator<GigawordText> {
    private static final Log log = LogFactory.getLog(GigawordDumpIterator.class);
    private static final boolean DEBUG = true;

    /** Root directory that contains the tagged Gigaword sub-directories. */
    private String dumpFileDir = Configuration.getInstance().getGigawordTaggedDirectory();
    /** Character decoder wrapped by {@link #brXMLDump}. */
    private InputStreamReader xover;
    /** Reader over the dump file that is currently being consumed. */
    private BufferedReader brXMLDump;
    /** Dump files to read, as paths relative to {@link #dumpFileDir}. */
    private List<String> dumpFiles = new ArrayList<String>();
    private Iterator<String> dumpFileIterator;
    /** Full path of the dump file that is currently open. */
    private String nextDumpFile;
    /** Text pre-fetched by {@link #hasNext()}, or {@code null} if none. */
    private GigawordText nextText;

    /**
     * Collects every {@code *.pos.gz} file contained in the configured Gigaword
     * sub-directories. No file is opened here.
     */
    public GigawordDumpIterator() {
        List<String> directories = Configuration.getInstance().getGigawordSubdirectories();
        log.debug("Files to be parsed: ");
        for (String subdir : directories) {
            File directory = new File(this.dumpFileDir + "/" + subdir);
            String[] fileNames = directory.list();
            if (fileNames == null) {
                // File.list() returns null when the path is not a directory or
                // cannot be read; skip it instead of failing with an NPE.
                log.warn("Cannot list Gigaword directory: " + directory);
                continue;
            }
            for (String fileName : fileNames) {
                if (fileName.endsWith(".pos.gz")) {
                    this.dumpFiles.add(subdir + "/" + fileName);
                    log.debug(fileName + "\t");
                }
            }
        }
        log.debug("\n");
        this.dumpFileIterator = this.dumpFiles.iterator();
        this.nextText = null;
    }

    /**
     * Opens the first dump file.
     *
     * @throws IOException if the file cannot be opened
     */
    public void open() throws IOException {
        this.openNext();
    }

    /**
     * Closes the current dump file (if any) and opens the next one. Does nothing
     * more than closing when no dump file is left.
     *
     * @throws IOException if closing the current file or opening the next one fails
     */
    protected void openNext() throws IOException {
        if (this.brXMLDump != null) {
            this.close();
        }
        if (!this.dumpFileIterator.hasNext()) {
            return;
        }
        String next = this.dumpFileIterator.next();
        this.nextDumpFile = this.dumpFileDir + "/" + next;
        int index = this.dumpFiles.indexOf(next);
        int total = this.dumpFiles.size();
        log.info(index + "/" + total + " (" + Strings.getPercentage(index, total) + ") "
                + "Opening: " + this.nextDumpFile);

        FileInputStream fin = new FileInputStream(this.nextDumpFile);
        boolean opened = false;
        try {
            GZIPInputStream gzis = new GZIPInputStream(fin);
            InputStreamReader reader = new InputStreamReader((InputStream) gzis, "UTF-8");
            this.brXMLDump = new BufferedReader(reader,
                    Configuration.getInstance().getBufferSizeReader() * Constants.KBYTE);
            this.xover = reader;
            opened = true;
        } finally {
            if (!opened) {
                // Do not leak the file handle when the gzip header is invalid or
                // the reader cannot be created.
                try {
                    fin.close();
                } catch (IOException ignored) {
                    // The original failure is the one worth reporting.
                }
            }
        }
    }

    /**
     * Closes the readers of the current dump file. Safe to call when no file has
     * been opened and safe to call more than once.
     *
     * @throws IOException if closing a reader fails
     */
    public void close() throws IOException {
        try {
            if (this.brXMLDump != null) {
                this.brXMLDump.close();
            }
        } finally {
            if (this.xover != null) {
                this.xover.close();
            }
        }
    }

    /**
     * Tells whether another text is available, pre-fetching it if necessary.
     *
     * @return {@code true} if a subsequent call to {@link #next()} returns a text
     */
    @Override
    public boolean hasNext() {
        if (this.nextText == null) {
            this.nextText = this.next();
        }
        return this.nextText != null;
    }

    /**
     * Returns the text built from the next line of the dumps, moving on to the
     * next dump file when the current one is exhausted.
     *
     * @return the next text, or {@code null} when every dump file has been
     *         consumed or an I/O error occurred
     */
    @Override
    public GigawordText next() {
        if (this.nextText != null) {
            GigawordText prefetched = this.nextText;
            this.nextText = null;
            return prefetched;
        }
        try {
            String textLine = this.readNextLine();
            if (textLine == null) {
                return null;
            }
            GigawordText text = new GigawordText(Files.getFileName(this.nextDumpFile));
            text.addSentence(this.parseSentence(textLine));
            return text;
        } catch (IOException e) {
            log.error("I/O error while reading " + this.nextDumpFile, e);
            return null;
        }
    }

    /**
     * Reads the next line, advancing through the remaining dump files as needed.
     * End of file is detected through {@code readLine()} returning {@code null};
     * {@code Reader.ready()} only tells whether a read would block and is not a
     * reliable end-of-stream test.
     *
     * @return the next line, or {@code null} when no dump file has data left
     */
    private String readNextLine() throws IOException {
        while (true) {
            if (this.brXMLDump != null) {
                String line = this.brXMLDump.readLine();
                if (line != null) {
                    return line;
                }
            }
            if (!this.dumpFileIterator.hasNext()) {
                return null;
            }
            this.openNext();
            if (this.brXMLDump == null) {
                // Defensive: nothing was opened, so there is nothing to read.
                return null;
            }
        }
    }

    /**
     * Parses one line of {@code word|tag} tokens into a sentence. Only tokens whose
     * word starts with a letter are kept. When a malformed token is met, the
     * problem is logged and the rest of the line is dropped; the words parsed so
     * far are kept.
     */
    private StanfordSentence parseSentence(String textLine) {
        StanfordSentence sentence = new StanfordSentence();
        try {
            for (String token : textLine.split(" ")) {
                String[] triple = token.split("\\|");
                String word = triple[0];
                String tag = triple[1].isEmpty() && triple.length > 2 ? triple[2] : triple[1];
                // The dump carries no separate lemma field: the word form is used.
                String lemma = triple[0];
                if (word.length() > 0 && Character.isLetter(word.charAt(0))) {
                    sentence.addWord(new WordLemmaTag(word, lemma, tag));
                }
            }
        } catch (Exception e) {
            log.warn("PROBLEM: a problem with the formatting of the data, at file " + this.nextDumpFile + "\n", e);
            log.warn(textLine + "\n");
        }
        return sentence;
    }

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("Unsupported operation 'remove'");
    }

    /**
     * Prints every sentence of the dumps as {@code word/tag} pairs, one sentence
     * per line, plus a running counter every 100000 sentences.
     */
    public static void main(String[] args) throws IOException {
        int counter = 0;
        GigawordDumpIterator gigadi = new GigawordDumpIterator();
        try {
            gigadi.open();
            while (gigadi.hasNext()) {
                GigawordText ukwt = gigadi.next();
                for (StanfordSentence ss : ukwt) {
                    for (WordLemmaTag wlt : ss.getWords()) {
                        System.out.print(wlt.word() + "/" + wlt.tag() + " ");
                    }
                    System.out.println();
                    if (++counter % 100000 == 0) {
                        System.out.println(counter);
                    }
                }
            }
        } finally {
            gigadi.close();
        }
    }
}

