/*
 * Decompiled with CFR 0.152.
 */
package it.uniroma1.lcl.jlt.wiki.iterator;

import it.uniroma1.lcl.jlt.Configuration;
import it.uniroma1.lcl.jlt.util.DirectoryFileManager;
import it.uniroma1.lcl.jlt.util.Language;
import it.uniroma1.lcl.jlt.wiki.data.WikiPageInfo;
import it.uniroma1.lcl.jlt.wiki.iterator.WikiDumpIterator;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Extracts page information and page text from a Wikipedia XML dump by
 * scanning it sequentially with a {@link WikiDumpIterator}.
 */
public class WikiDumpExtractor {
    private static final Log log = LogFactory.getLog(WikiDumpExtractor.class);

    /** Number of counted pages between two progress log messages. */
    private static final int LOG_INTERVAL = 100000;

    /**
     * Second argument handed to the {@link DirectoryFileManager} constructor.
     * NOTE(review): presumably the maximum number of files per directory -- confirm
     * against DirectoryFileManager.
     */
    private static final int FILE_MANAGER_SIZE = 100;

    /** Extension appended to the page id to build the output file name. */
    private static final String TEXT_FILE_EXTENSION = ".wiki";

    /** Location of the dump, passed unchanged to every {@link WikiDumpIterator}. */
    private final String path2Dump;

    /**
     * Creates an extractor for the given dump.
     *
     * @param xmlSource location of the Wikipedia XML dump, handed to {@link WikiDumpIterator}
     */
    public WikiDumpExtractor(String xmlSource) {
        this.path2Dump = xmlSource;
    }

    /**
     * Scans the whole dump and collects every page whose title is contained in
     * {@code titles}.
     *
     * @param titles titles of the pages to collect; must not be {@code null}
     * @return the matching pages (possibly empty, never {@code null})
     * @throws IOException if the dump cannot be read
     */
    public Set<WikiPageInfo> getPages(Set<String> titles) throws IOException {
        WikiDumpIterator iterator = new WikiDumpIterator(this.path2Dump);
        HashSet<WikiPageInfo> pages = new HashSet<WikiPageInfo>();
        iterator.open();
        try {
            while (iterator.hasNext()) {
                WikiPageInfo info = iterator.next(false);
                if (titles.contains(info.getTitle())) {
                    pages.add(info);
                }
            }
        } finally {
            // Release the dump even when the scan fails half-way.
            iterator.close();
        }
        return pages;
    }

    /**
     * Writes the text of the requested pages to files named {@code <id>.wiki},
     * obtained from a {@link DirectoryFileManager} created for {@code outDir}.
     *
     * <p>If {@code ids} is empty, the text of <em>every</em> page in the dump is
     * written. Otherwise only pages whose id is in {@code ids} are written, the
     * scan stops as soon as all requested ids have been found, and each id that
     * was found is removed from {@code ids} (so after the call {@code ids} holds
     * the ids that were not found in the dump).
     *
     * @param ids    ids of the pages to save, or an empty collection to save all pages;
     *               must not be {@code null} and, when non-empty, must support {@code remove}
     * @param outDir output directory handed to the {@link DirectoryFileManager}
     * @throws IOException if the dump cannot be read or a file cannot be written
     */
    public void saveText(Collection<String> ids, String outDir) throws IOException {
        boolean saveAll = ids.isEmpty();
        // Hash-based copy: the lookup below runs once per dump page, and callers
        // (e.g. main) may pass a List whose contains() is linear.
        Set<String> pending = new HashSet<String>(ids);
        DirectoryFileManager manager = new DirectoryFileManager(outDir, FILE_MANAGER_SIZE);
        WikiDumpIterator iterator = new WikiDumpIterator(this.path2Dump);
        iterator.open();
        try {
            int processed = 0;
            while (iterator.hasNext()) {
                // Every requested page has been written: no need to read further.
                if (!saveAll && pending.isEmpty()) {
                    break;
                }
                WikiPageInfo info = iterator.next(false);
                String id = info.getId();
                if (saveAll || pending.remove(id)) {
                    if (!saveAll) {
                        // Keep the caller-visible bookkeeping: found ids leave the collection.
                        ids.remove(id);
                    }
                    writeText(manager, id + TEXT_FILE_EXTENSION, info.getText());
                }
                if (processed % LOG_INTERVAL == 0) {
                    log.info("[SAVE-TEXT] PROCESSED " + processed + " PAGES SO FAR...");
                }
                ++processed;
            }
        } finally {
            iterator.close();
        }
    }

    /** Writes {@code text} to the writer the manager provides for {@code fileName}, always closing it. */
    private static void writeText(DirectoryFileManager manager, String fileName, String text) throws IOException {
        FileWriter writer = manager.getFileWriter(fileName);
        try {
            writer.write(text);
        } finally {
            // close() flushes, so no separate flush() is needed.
            writer.close();
        }
    }

    /**
     * Scans the whole dump and returns the ids of all pages that are neither
     * disambiguation pages, nor redirection pages, nor namespace pages.
     *
     * @return the ids of the retained pages (possibly empty, never {@code null})
     * @throws IOException if the dump cannot be read
     */
    public Set<String> getIds() throws IOException {
        HashSet<String> ids = new HashSet<String>();
        WikiDumpIterator iterator = new WikiDumpIterator(this.path2Dump);
        iterator.open();
        try {
            // Counts retained pages only; skipped pages do not advance the counter.
            int retained = 0;
            while (iterator.hasNext()) {
                WikiPageInfo info = iterator.next(false);
                if (info.isDisambiguationPage() || info.isRedirectionPage() || info.isNamespace()) {
                    continue;
                }
                ids.add(info.getId());
                if (retained % LOG_INTERVAL == 0) {
                    log.info("[GET-ID] PROCESSED " + retained + " PAGES SO FAR...");
                }
                ++retained;
            }
        } finally {
            iterator.close();
        }
        return ids;
    }

    /**
     * Saves the text of a random sample of pages from the English Wikipedia dump
     * named by the {@link Configuration}.
     *
     * <p>Arguments (both optional): {@code args[0]} is the output directory
     * (default {@code tmp/wikiText}); {@code args[1]} is the sample size
     * (default 1000). A sample size that is negative or larger than the number
     * of available ids selects all ids.
     *
     * <p>NOTE(review): a sample size of 0 produces an empty sample, which
     * {@link #saveText(Collection, String)} treats as "save every page" --
     * confirm this is intended.
     *
     * @param args command-line arguments as described above
     */
    public static void main(String[] args) {
        try {
            String outDir = "tmp/wikiText";
            if (args.length > 0) {
                outDir = args[0];
            }
            int sampleSize = 1000;
            if (args.length > 1) {
                sampleSize = Integer.parseInt(args[1]);
            }
            WikiDumpExtractor extractor = new WikiDumpExtractor(Configuration.getInstance().getWikipediaXMLDump(Language.EN));
            ArrayList<String> ids = new ArrayList<String>(extractor.getIds());
            Collections.shuffle(ids);
            int available = ids.size();
            int take = (sampleSize < 0 || sampleSize > available) ? available : sampleSize;
            // Copy the prefix: saveText removes found ids from the collection it is given.
            ArrayList<String> sample = new ArrayList<String>(ids.subList(0, take));
            extractor.saveText(sample, outDir);
        }
        catch (Exception e) {
            // Top-level boundary: report through the class logger instead of stderr.
            log.error("Wikipedia text extraction failed", e);
        }
    }
}

