/*
 * Decompiled with CFR 0.152.
 */
package it.uniroma1.lcl.jlt.wiki.iterator;

import it.uniroma1.lcl.jlt.Configuration;
import it.uniroma1.lcl.jlt.Constants;
import it.uniroma1.lcl.jlt.util.Collections;
import it.uniroma1.lcl.jlt.util.Strings;
import it.uniroma1.lcl.jlt.wiki.WikiTokenizer;
import it.uniroma1.lcl.jlt.wiki.data.WikiPageInfo;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class WikiDumpIterator
implements Iterator<WikiPageInfo> {
    private static final String XML_END_TAG = "</mediawiki>";
    private static final String DISAMBIGUATION_PREFIX = "(" + Collections.getFirst(Configuration.getInstance().getDisambiguationPrefixes()) + ")";
    private static final Set<String> SHORT_DISAMBIGUATION_PREFIX = Configuration.getInstance().getShortDisambiguationPrefix();
    private static final Log log = LogFactory.getLog(WikiDumpIterator.class);
    private String path2Dump;
    private InputStreamReader isr;
    private BufferedReader brXMLDump;
    private String untrimmedLine;
    private boolean read;

    public WikiDumpIterator(String xmlSource) {
        this.path2Dump = xmlSource;
    }

    public void open() throws IOException {
        log.info((Object)("OPENING WIKI ITERATOR @ " + this.path2Dump));
        this.isr = new InputStreamReader((InputStream)new FileInputStream(this.path2Dump), "UTF-8");
        this.brXMLDump = new BufferedReader(this.isr, Configuration.getInstance().getBufferSizeReader() * Constants.KBYTE);
    }

    public void close() throws IOException {
        log.info((Object)("CLOSING WIKI ITERATOR @ " + this.path2Dump));
        this.brXMLDump.close();
        this.isr.close();
    }

    @Override
    public boolean hasNext() {
        boolean test = false;
        try {
            if (!this.read) {
                this.untrimmedLine = this.brXMLDump.readLine();
            }
            if (this.untrimmedLine != null && !this.untrimmedLine.equals(XML_END_TAG)) {
                test = true;
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        return test;
    }

    @Override
    public WikiPageInfo next() {
        return this.next(true);
    }

    public WikiPageInfo next(boolean enableSkipping) {
        WikiPageInfo wp = null;
        if (!this.read) {
            try {
                this.untrimmedLine = this.brXMLDump.readLine();
                this.read = false;
            }
            catch (IOException ioe) {
                ioe.printStackTrace();
            }
        }
        long currentOffset = 0L;
        boolean bSkip = false;
        boolean bText = false;
        StringBuffer text = null;
        String id = null;
        String title = null;
        String lemma = null;
        String offsets = null;
        if (this.untrimmedLine != null) {
            wp = new WikiPageInfo();
            while (this.untrimmedLine != null) {
                int idx;
                int untrimmedByteLength = this.untrimmedLine.getBytes().length;
                currentOffset += (long)(untrimmedByteLength + 1);
                String line = this.untrimmedLine.trim();
                if (this.untrimmedLine.startsWith("  <page>")) {
                    wp = new WikiPageInfo();
                } else if (this.untrimmedLine.startsWith("    <id>")) {
                    idx = line.indexOf("</", 4);
                    id = line.substring(4, idx);
                    offsets = null;
                    wp.setId(id);
                } else if (line.startsWith("<title>")) {
                    idx = line.indexOf("</title>", 7);
                    title = line.substring(7, idx);
                    wp.setTitle(title);
                    offsets = null;
                    if (WikiTokenizer.isNamespace(title)) {
                        wp.setNamespace(true);
                        bSkip = true;
                    }
                    if (!bSkip || !enableSkipping) {
                        if (title.endsWith(")")) {
                            if (title.endsWith(DISAMBIGUATION_PREFIX)) {
                                wp.setDisambiguationPage(true);
                                bSkip = true;
                            } else {
                                lemma = this.getCleanedTitle(title);
                            }
                        } else {
                            lemma = title;
                        }
                        if (lemma != null) {
                            wp.setLemma(lemma.toLowerCase());
                        } else {
                            wp.setLemma("");
                        }
                    }
                } else if (line.startsWith("<text") && (idx = line.indexOf("/>", 5)) == -1) {
                    idx = line.indexOf(">", 5);
                    line = line.substring(idx + 1);
                    int startIdx = this.untrimmedLine.indexOf("<text");
                    startIdx = this.untrimmedLine.indexOf(">", startIdx + 5);
                    int startByteIdx = this.untrimmedLine.substring(0, startIdx).getBytes().length;
                    offsets = currentOffset - (long)(untrimmedByteLength - startByteIdx) + "\t";
                    bText = true;
                    text = new StringBuffer();
                }
                int endText = line.indexOf("</text");
                if (bText) {
                    int last = endText;
                    if (last == -1) {
                        last = line.length();
                    }
                    text.append(line.substring(0, last));
                    text.append("\n");
                    if (endText != -1) {
                        int endIdx = this.untrimmedLine.indexOf("</text");
                        int endByteIdx = this.untrimmedLine.substring(0, endIdx).getBytes().length;
                        offsets = String.valueOf(offsets) + (currentOffset - (long)(untrimmedByteLength - endByteIdx) - 1L);
                        bText = false;
                        if (!bSkip || !enableSkipping) {
                            if (Strings.containsOneOf(text.toString(), SHORT_DISAMBIGUATION_PREFIX)) {
                                wp.setDisambiguationPage(true);
                            } else {
                                wp.setText(text.toString());
                                wp.setOffset(offsets);
                            }
                            try {
                                this.untrimmedLine = this.brXMLDump.readLine();
                                this.untrimmedLine = this.brXMLDump.readLine();
                            }
                            catch (IOException e) {
                                this.untrimmedLine = null;
                                e.printStackTrace();
                            }
                            break;
                        }
                        bSkip = false;
                    }
                }
                try {
                    this.untrimmedLine = this.brXMLDump.readLine();
                }
                catch (IOException e) {
                    this.untrimmedLine = null;
                    e.printStackTrace();
                }
            }
        }
        return wp;
    }

    /*
     * Enabled aggressive block sorting
     * Enabled unnecessary exception pruning
     * Enabled aggressive exception aggregation
     */
    public String nextPage() {
        try {
            StringBuffer result = new StringBuffer();
            if (!this.read) {
                try {
                    this.untrimmedLine = this.brXMLDump.readLine();
                    this.read = false;
                }
                catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (this.untrimmedLine == null) {
                return null;
            }
            while (!this.untrimmedLine.contains("<page>") && this.brXMLDump.ready()) {
                this.untrimmedLine = this.brXMLDump.readLine();
            }
            result.append(String.valueOf(this.untrimmedLine) + "\n");
            while (!this.untrimmedLine.contains("</page>") && this.brXMLDump.ready()) {
                this.untrimmedLine = this.brXMLDump.readLine();
                result.append(String.valueOf(this.untrimmedLine) + "\n");
            }
            return result.toString();
        }
        catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /*
     * Unable to fully structure code
     */
    public String nextDisambiguationPage() {
        result = this.nextPage();
        if (result != null) ** GOTO lbl7
        return null;
lbl-1000:
        // 1 sources

        {
            result = this.nextPage();
            if (result != null) continue;
            return null;
lbl7:
            // 2 sources

            ** while (!Strings.containsOneOf((String)result, WikiDumpIterator.SHORT_DISAMBIGUATION_PREFIX))
        }
lbl8:
        // 1 sources

        return result;
    }

    private String getCleanedTitle(String title) {
        String s = title;
        int p = title.lastIndexOf("(") - 1;
        if (p > 0) {
            char c = title.charAt(p);
            s = c == ' ' || c == '_' ? title.substring(0, p).trim() : title.substring(0, p + 1).trim();
        }
        return s;
    }

    @Override
    public void remove() {
        throw new RuntimeException("Unsupported operation 'remove'");
    }

    public static void main(String[] args) throws IOException {
        WikiDumpIterator i = new WikiDumpIterator(Configuration.getInstance().getWikipediaXMLDump());
        i.open();
        int pageCount = 0;
        while (i.hasNext()) {
            WikiPageInfo page = i.next();
            ++pageCount;
            if (page.getTitle().contains("Alien (band)")) {
                System.out.println(page.getTitle());
                break;
            }
            if (pageCount % 10000 != 0) continue;
            System.out.println("Analizzati " + pageCount + " documenti");
        }
        i.close();
    }
}

