/*
 * Decompiled with CFR 0.152.
 */
package it.uniroma1.lcl.jlt.wiki;

import it.uniroma1.lcl.jlt.Configuration;
import it.uniroma1.lcl.jlt.util.Language;
import it.uniroma1.lcl.jlt.wiki.WikiParsingUtility;
import it.uniroma1.lcl.jlt.wiki.WikiTokenizer;
import it.uniroma1.lcl.jlt.wiki.data.WikiPageInfo;
import it.uniroma1.lcl.jlt.wiki.iterator.WikiDumpIterator;
import java.io.FileWriter;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Extracts a one-sentence gloss for every regular page of a Wikipedia XML dump
 * and writes the glosses to a tab-separated text file.
 *
 * <p>Each output row has the form {@code id \t title \t lemma \t gloss \n}. Pages
 * flagged as disambiguation, redirection or namespace pages are skipped.
 */
public class WikiGlossExtractor {
    private static final Log log = LogFactory.getLog(WikiGlossExtractor.class);

    /** Minimum number of whitespace-separated tokens a gloss must have to be kept. */
    private static final int MIN_GLOSS_TOKENS = 5;

    /** Progress is logged once every this many processed pages. */
    private static final int PROGRESS_INTERVAL = 10000;

    /** Path of the Wikipedia XML dump handed to {@link WikiDumpIterator}. */
    private final String path2Dump;

    /**
     * Creates an extractor bound to a dump.
     *
     * @param xmlSource path of the Wikipedia XML dump to read
     */
    public WikiGlossExtractor(String xmlSource) {
        this.path2Dump = xmlSource;
    }

    /**
     * Iterates over the dump and writes one row per page for which a gloss
     * sentence could be located.
     *
     * @param outFile  path of the file to (over)write with the extracted glosses
     * @param language language of the dump; selects language-specific clean-up rules
     * @throws IOException if the output file cannot be opened or written, or if
     *                     gloss extraction fails with an I/O error
     */
    public void getGlosses(String outFile, Language language) throws IOException {
        log.info("EXTRACTING GLOSSES FROM " + this.path2Dump);
        log.info("SAVING TO " + outFile);
        // NOTE(review): the iterator is opened but never closed here; whether
        // WikiDumpIterator offers a close method is not visible from this file.
        WikiDumpIterator iterator = new WikiDumpIterator(this.path2Dump);
        iterator.open();
        // NOTE(review): FileWriter encodes with the platform default charset, so the
        // output encoding depends on the JVM the extractor runs on.
        FileWriter writer = new FileWriter(outFile);
        try {
            // Counts processed (non-skipped) pages, including those yielding no gloss.
            int counter = 0;
            while (iterator.hasNext()) {
                WikiPageInfo info = iterator.next(false);
                if (info.isDisambiguationPage() || info.isRedirectionPage() || info.isNamespace()) {
                    continue;
                }
                String id = info.getId();
                String title = info.getTitle();
                String rawText = info.getText();
                String lemma = info.getLemma();
                String gloss = this.extractGloss(id, title, lemma, rawText, language);
                if (!gloss.isEmpty()) {
                    writer.write(gloss);
                }
                if (++counter % PROGRESS_INTERVAL == 0) {
                    log.info("EXTRACTED " + counter + " GLOSSES SO FAR");
                }
            }
            writer.flush();
        } finally {
            // Release the file handle even when iteration or extraction fails.
            writer.close();
        }
    }

    /**
     * Builds the output row for a single page.
     *
     * <p>The gloss is the text up to and including the first {@code ". "} of the
     * tokenized page text. When no such sentence boundary exists the empty string
     * is returned and nothing is written for the page. When the cleaned gloss has
     * fewer than {@value #MIN_GLOSS_TOKENS} tokens, the gloss text is dropped but
     * the row is still emitted with an empty gloss column.
     *
     * @param id       page identifier, written as the first column
     * @param title    page title, written as the second column
     * @param lemma    page lemma, written as the third column
     * @param rawText  raw page text as delivered by the dump iterator
     * @param language language used to select clean-up rules
     * @return the newline-terminated row, or the empty string if no sentence was found
     * @throws IOException declared for the tokenizer/parsing helpers used here
     */
    private String extractGloss(String id, String title, String lemma, String rawText, Language language) throws IOException {
        String cleanRawText = WikiGlossExtractor.preMassageRawText(rawText, language);
        WikiTokenizer tokenizer = new WikiTokenizer(cleanRawText, id);
        String cleanTxt = tokenizer.getText(false);
        // Re-attach periods to adjacent digits ("3 . 5" -> "3.5") so that numbers are
        // not mistaken for a sentence boundary by the ". " search below.
        cleanTxt = cleanTxt.replaceAll("([0-9]) \\.", "$1.");
        cleanTxt = cleanTxt.replaceAll("\\. ([0-9])", ".$1");

        int dot = cleanTxt.indexOf(". ");
        if (dot == -1) {
            return "";
        }

        // Run every space-separated token of the first sentence through extractText.
        String firstSentence = cleanTxt.substring(0, dot + 2);
        StringBuilder tokens = new StringBuilder();
        for (String token : firstSentence.split(" ")) {
            if (tokens.length() > 0) {
                tokens.append(" ");
            }
            tokens.append(WikiParsingUtility.extractText(token));
        }

        String finalGloss = WikiGlossExtractor.cleanGloss(tokens.toString(), language);
        int sentenceLength = finalGloss.split("\\s+").length;

        StringBuilder row = new StringBuilder();
        row.append(id).append("\t").append(title).append("\t").append(lemma).append("\t");
        if (sentenceLength < MIN_GLOSS_TOKENS) {
            // The warning shows the gloss as it was before cleanGloss was applied.
            log.warn("DROPPING SHORT GLOSS OF LENGTH " + sentenceLength + ": " + tokens);
        } else {
            row.append(finalGloss);
        }
        row.append("\n");
        return row.toString();
    }

    /**
     * Rewrites apostrophes in the raw page text before it is tokenized.
     *
     * <p>For IT, ES, FR and CA the elided forms {@code l'}, {@code d'} and
     * {@code un'} (either case of the first letter) are replaced by a space, the
     * same letters and a right single quotation mark (U+2019). For every language,
     * an apostrophe enclosed in HTML-escaped {@code nowiki} tags is replaced by
     * U+2019 as well. {@link #cleanGloss} later turns U+2019 back into {@code '}.
     *
     * @param rawText  raw page text
     * @param language language of the text
     * @return the rewritten text
     */
    public static String preMassageRawText(String rawText, Language language) {
        String tmpRawText = rawText;
        switch (language) {
            case IT:
            case ES:
            case FR:
            case CA: {
                tmpRawText = tmpRawText.replaceAll("([lL])'", " $1\u2019");
                tmpRawText = tmpRawText.replaceAll("([Dd])'", " $1\u2019");
                tmpRawText = tmpRawText.replaceAll("([Uu]n)'", " $1\u2019");
                break;
            }
            default: {
                // No language-specific rewriting for the remaining languages.
                break;
            }
        }
        tmpRawText = tmpRawText.replaceAll("(nowiki&gt;)'(&lt;/nowiki)", "$1\u2019$2");
        return tmpRawText;
    }

    /**
     * Post-processes an extracted gloss.
     *
     * <p>For IT, text matching {@code Aiuto.*--} is removed. For every language,
     * U+2019 together with one optional following whitespace character becomes a
     * plain apostrophe, the {@code __NOTOC__} marker is removed and the result is
     * trimmed.
     *
     * @param gloss    gloss text to clean
     * @param language language of the gloss
     * @return the cleaned, trimmed gloss
     */
    private static String cleanGloss(String gloss, Language language) {
        String finalGloss = gloss;
        switch (language) {
            case IT: {
                finalGloss = finalGloss.replaceAll("Aiuto.*--", "");
                break;
            }
            default: {
                break;
            }
        }
        finalGloss = finalGloss.replaceAll("\u2019\\s?", "'");
        // Literal token, so a plain (non-regex) replacement is sufficient.
        finalGloss = finalGloss.replace("__NOTOC__", "");
        return finalGloss.trim();
    }

    /**
     * Extracts glosses for CA, DE, EN, ES, FR and IT, using the dump and output
     * paths supplied by {@link Configuration}.
     *
     * <p>The first failure aborts the remaining languages and is reported through
     * the class logger.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            Configuration config = Configuration.getInstance();
            Language[] languages = new Language[]{Language.CA, Language.DE, Language.EN, Language.ES, Language.FR, Language.IT};
            for (Language language : languages) {
                String inFile = config.getWikipediaXMLDump(language);
                String outFile = config.getWikiGlossDump(language);
                WikiGlossExtractor extractor = new WikiGlossExtractor(inFile);
                extractor.getGlosses(outFile, language);
            }
        }
        catch (Exception e) {
            log.error("GLOSS EXTRACTION FAILED", e);
        }
    }
}

